-
Notifications
You must be signed in to change notification settings - Fork 1
/
collect_articles.py
87 lines (69 loc) · 2.34 KB
/
collect_articles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
"""
Script for scraping news articles from websites.
Example usage for downloading all articles listed in a CSV file and storing them in the
directory /tmp/news-articles:
$ python3 collect_articles.py urls.txt /tmp/news-articles
Scraped articles are stored in JSON files with the following structure:
{
    "text": "<ARTICLE-TEXT>",
    "title": "<ARTICLE_TITLE>",
    "date": "<ARTICLE_DATE>",
    "url": "<ARTICLE_URL>",
    ...
}
Each article is saved to its own file in the specified directory.
File names are automatically chosen based on the respective article URLs.
In each JSON file you can find the contents ("text"), title, publication date,
URL and additional site-dependent metadata.
"""
import argparse
import logging
from scraping import change_log_file_path, read_sources, scrape_and_store_articles
# Module-level logger registered under the name 'scraping' (the same name as the
# package imported above); main() logs its progress message through it.
logger = logging.getLogger('scraping')
def main():
    """Parse the command-line arguments and scrape the listed articles.

    The function redirects the log file to the path given by ``--log-path``,
    reads the article sources from ``source_path`` via ``read_sources`` and
    hands them to ``scrape_and_store_articles`` together with the output
    directory and the concurrency options.

    Raises:
        RuntimeError: If ``read_sources`` returns an empty collection.
    """
    parser = argparse.ArgumentParser(
        epilog=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        'source_path',
        help='Path to a file containing a list of source URLs.',
    )
    parser.add_argument(
        'output_path',
        help='Path to the directory in which the articles should be stored.',
    )
    parser.add_argument(
        '--log-path',
        help='Path to log file (default: /tmp/scraping.log)',
        default='/tmp/scraping.log',
    )
    parser.add_argument(
        '-m', '--enable-mp',
        help='Enable multiprocessing (faster, but logging may be broken)',
        action='store_true',
    )
    parser.add_argument(
        '-n', '--num-workers',
        help='Number of concurrent worker processes/threads',
        type=int,
    )
    args = parser.parse_args()

    # Set up logging: override the log file path.
    change_log_file_path(args.log_path)

    # Read the article sources from the given file.
    logger.info('Finding articles in file...')
    article_sources = read_sources(args.source_path)

    # Use len() rather than truthiness: the return type of read_sources is not
    # visible here, and array-like containers do not support bool().
    if len(article_sources) == 0:
        raise RuntimeError('No articles found. Aborting...')

    scrape_and_store_articles(
        article_sources,
        parent_dir=args.output_path,
        multiprocessing=args.enable_mp,
        num_workers=args.num_workers,
    )
if __name__ == '__main__':
    # Run the command-line interface only when this file is executed as a
    # script, not when it is imported as a module.
    main()