-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
245 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
'''20. JSONデータの読み込み | ||
Wikipedia記事のJSONファイルを読み込み,「イギリス」に関する記事本文を表示せよ. | ||
問題21-29では,ここで抽出した記事本文に対して実行せよ.''' | ||
|
||
|
||
import gzip | ||
import json | ||
# Scan the gzipped dump, parsing one JSON object per line, and save the body
# of the article titled 'イギリス' to jawiki-uk.txt for the later exercises.
# Both handles are managed by `with`, so the gzip stream (which the previous
# version never closed) and the output file are released even on errors.
with gzip.open('jawiki-country.json.gz') as f, \
        open('jawiki-uk.txt', 'w', encoding='utf-8') as f_uk:
    for line in f:
        obj = json.loads(line)  # json.loads() turns the JSON text into a dict
        if obj['title'] == 'イギリス':
            f_uk.write(obj['text'])
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
'''21. カテゴリ名を含む行を抽出 | ||
記事中でカテゴリ名を宣言している行を抽出せよ. | ||
''' | ||
|
||
import re | ||
# Find the category declarations ("[[Category:...]]") in the article text and
# write each matched declaration to knock21_output.txt, one per line.
# The files are opened with `with` so they are closed even if a write fails,
# and the input is streamed instead of being loaded with readlines().
category_re = re.compile(r'\[\[Category:.*\]\]')
with open('jawiki-uk.txt', 'r', encoding='utf-8') as f_uk, \
        open('knock21_output.txt', 'w', encoding='utf-8') as f_out:
    for line in f_uk:
        rs = category_re.search(line)
        if rs:
            f_out.write(rs.group() + '\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
'''22. カテゴリ名の抽出 | ||
記事のカテゴリ名を(行単位ではなく名前で)抽出せよ.''' | ||
|
||
|
||
import re | ||
# Extract the bare category names from lines ending in "[[Category:name]]" or
# "[[Category:name|sortkey]]" and write them to knock22_output.txt.
# The files are opened with `with` so they are closed even if a write fails,
# and the input is streamed instead of being loaded with readlines().
category_name_re = re.compile(r'\[\[Category:(.*?)(\|.*)?\]\]$')
with open('jawiki-uk.txt', 'r', encoding='utf-8') as f_uk, \
        open('knock22_output.txt', 'w', encoding='utf-8') as f_out:
    for line in f_uk:
        # Strip first so the trailing newline cannot interfere with `$`.
        rs2 = category_name_re.search(line.strip())
        if rs2:
            # group(1) is the name; the optional "|sortkey" part is group(2).
            f_out.write(rs2.group(1) + '\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
'''23. セクション構造 | ||
記事中に含まれるセクション名とそのレベル(例えば”== セクション名 ==”なら1)を表示せよ.''' | ||
|
||
|
||
import re | ||
|
||
# Write every section heading of the article together with its level to
# knock23_output.txt ("== name ==" is level 1, "=== name ===" is level 2, ...).
# The files are opened with `with` so they are closed even if a write fails,
# and the input is streamed instead of being loaded with readlines().
heading_re = re.compile(r'(\=+)(.*?)(\=+)$')
with open('jawiki-uk.txt', 'r', encoding='utf-8') as f_uk, \
        open('knock23_output.txt', 'w', encoding='utf-8') as f_out:
    for line in f_uk:
        rs = heading_re.match(line.strip())
        if rs:
            # Groups follow the parentheses in the pattern:
            # (1) leading '=' run, (2) heading text, (3) trailing '=' run.
            # Reference: https://www.javadrive.jp/python/regex/index4.html
            section_name = rs.group(2).strip()
            # The level is one less than the number of leading '=' signs.
            section_level = len(rs.group(1)) - 1
            f_out.write(f'レベル{section_level} {section_name}\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
''' | ||
24. ファイル参照の抽出 | ||
記事から参照されているメディアファイルをすべて抜き出せ. | ||
''' | ||
|
||
import re | ||
# Write the name of every media file referenced by the article to
# knock24_output.txt, one per line.
#
# The previous pattern was anchored with `$` and only the first findall()
# result was used, so a file link followed by other text, or a second link on
# the same line, was silently dropped. The pattern below captures just the
# file name (everything up to the first '|' or ']') for every link on the
# line, and accepts both the Japanese "ファイル:" and the "File:" prefix.
media_re = re.compile(r'\[\[(?:ファイル|File):([^\]|]+)')
with open('jawiki-uk.txt', 'r', encoding='utf-8') as f, \
        open('knock24_output.txt', 'w', encoding='utf-8') as f_out:
    for line in f:
        for refer in media_re.findall(line):
            f_out.write(f'{refer.strip()}\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
''' | ||
25. テンプレートの抽出 | ||
記事中に含まれる「基礎情報」テンプレートのフィールド名と値を抽出し,辞書オブジェクトとして格納せよ. | ||
''' | ||
|
||
import re | ||
# Collect "|field = value" lines of the article into the dict `basis_info`
# and dump them to knock25_output.txt as "field : value" lines.
# NOTE(review): every line that starts with '|' and contains " =" is
# accepted; the match is not limited to the 基礎情報 template itself.
# The files are opened with `with` so they are closed even if a write fails,
# and the input is streamed instead of being loaded with readlines().
field_re = re.compile(r'\|(.*?)(\s\=\s*)(.*?)$')
basis_info = dict()
with open('jawiki-uk.txt', 'r', encoding='utf-8') as f:
    for line in f:
        rs = field_re.match(line.strip())
        if rs:
            # group(1) is the field name, group(3) the value; group(2) is the
            # " = " separator. A repeated field name overwrites the earlier one.
            basis_info[rs.group(1)] = rs.group(3)

with open('knock25_output.txt', 'w', encoding='utf-8') as f_out:
    for my_key, my_value in basis_info.items():
        f_out.write(f'{my_key} : {my_value}\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
''' | ||
26. 強調マークアップの除去 | ||
25の処理時に,テンプレートの値からMediaWikiの強調マークアップ(弱い強調,強調,強い強調のすべて)を除去してテキストに変換せよ | ||
''' | ||
|
||
import re | ||
# Same "|field = value" extraction as exercise 25, additionally removing the
# MediaWiki emphasis markup (runs of apostrophes) from each value before it
# is written to knock26_output.txt as "field : value".
# The previous version never closed the output file; `with` now closes both
# files, including when an error occurs part-way through.
field_re = re.compile(r'\|(.*?)(\s\=\s*)(.*?)$')
with open('jawiki-uk.txt', 'r', encoding='utf-8') as f, \
        open('knock26_output.txt', 'w', encoding='utf-8') as f_out:
    for line in f:
        rs = field_re.match(line.strip())
        if rs:
            field_value = rs.group(3)  # group(3) is the value, not the name
            # Drop every run of one or more apostrophes ('', ''', ''''').
            rs_empha_delete = re.sub(r'\'+', '', field_value)
            f_out.write(f'{rs.group(1)} : {rs_empha_delete}\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
''' | ||
27. 内部リンクの除去 | ||
26の処理に加えて,テンプレートの値からMediaWikiの内部リンクマークアップを除去し,テキストに変換せよ | ||
''' | ||
|
||
import re | ||
|
||
# Exercise 25 + 26 + internal-link removal: for each "|field = value" line,
# strip emphasis markup, turn internal links into plain text and write
# "field : value" to knock27_output.txt.
#
# Fixes over the previous version:
# * every link in a value is resolved ("[[a|b]] [[c|d]]" -> "b d"); before,
#   only one piped link was handled and earlier ones leaked as "a|b";
# * the file-link check uses str.startswith instead of a non-raw regex
#   string containing the invalid escape '\[';
# * both files are closed by `with`, including on errors.
field_re = re.compile(r'\|(.*?)\s\=\s*(.*?)$')  # 25: field extraction
# [[target]] -> target, [[target|label]] -> label (last '|' segment wins).
link_re = re.compile(r'\[\[(?:[^\[\]|]*\|)*([^\[\]|]*)\]\]')
with open('jawiki-uk.txt', 'r', encoding='utf-8') as f, \
        open('knock27_output.txt', 'w', encoding='utf-8') as f_out:
    for line in f:
        rs = field_re.match(line.strip())
        if not rs:
            continue
        # 26: remove emphasis markup (runs of apostrophes).
        rs_empha_delete = re.sub(r'\'+', '', rs.group(2))

        if rs_empha_delete.startswith('[[ファイル:'):
            # "[[ファイル:...]]" is a media reference, not an internal link,
            # so the value is written through unchanged.
            field_str = rs_empha_delete
        else:
            field_str = link_re.sub(r'\1', rs_empha_delete)
            # Remove any bracket pairs the link pattern could not resolve.
            field_str = re.sub(r'\[\[|\]\]', '', field_str)
        f_out.write(f'{rs.group(1)} : {field_str}\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
''' | ||
28. MediaWikiマークアップの除去 | ||
27の処理に加えて,テンプレートの値からMediaWikiマークアップを可能な限り除去し,国の基本情報を整形せよ. | ||
''' | ||
|
||
import re | ||
|
||
def basic_dict(filename, pattern):
    """Read the fields of the 基礎情報 国 template from a wiki-text file.

    Lines are ignored until one containing ``{{基礎情報 国`` is seen. From
    there, each line is searched with ``pattern`` until a line consisting
    only of ``}}`` ends the template.

    Args:
        filename: Path of the UTF-8 text file holding the article markup.
        pattern: Regular expression whose group 1 is the field name and
            group 2 the field value.

    Returns:
        A dict mapping field names to values, both stripped of surrounding
        whitespace. Lines inside the template that do not match ``pattern``
        are skipped.
    """
    # Named `fields` rather than `dict` so the builtin is not shadowed.
    fields = {}
    in_template = False
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if re.search(r'{{基礎情報\s*国', line):
                in_template = True
                continue
            if not in_template:
                continue
            if re.search(r'^}}$', line):
                break
            found = re.search(pattern, line)
            if found:
                key = found.group(1).strip()
                # The old `.strip('')` removed nothing (an empty character
                # set); strip() actually trims surrounding whitespace.
                fields[key] = found.group(2).strip()

    return fields
|
||
# 内部リンクマークアップを除去 | ||
def remove_link(x):
    """Replace MediaWiki ``[[...]]`` links in ``x`` with their display text.

    The patterns are applied in order: links with two or more ``|``
    separators, then ``[[target|label]]``, then plain ``[[target]]``.
    """
    link_patterns = (
        r'\[\[[^\|\]]+\|[^{}\|\]]+\|([^\]]+)\]\]',
        r'\[\[[^\|\]]+\|([^\]]+)\]\]',
        r'\[\[([^\]]+)\]\]',
    )
    text = x
    for link_pattern in link_patterns:
        # Keep only the captured display text of each link.
        text = re.sub(link_pattern, r'\1', text)
    return text
|
||
def remove_markups(x):
    """Strip templates and HTML-like tags from a template field value.

    Steps, in order:
      1. ``{{a|b|...|text}}`` (two or more ``|``) is replaced by its last
         segment, e.g. ``{{lang|en|United Kingdom}}`` -> ``United Kingdom``.
      2. Self-closing tags such as ``<br />`` or ``<ref name="x" />`` are
         removed.
      3. Paired tags are removed together with their content, e.g.
         ``<ref>...</ref>``.
      4. The ``{{0}}`` padding template is removed.

    Every pattern is bounded by its own delimiters. The earlier greedy
    ``.*`` versions matched from the first template or tag on the line to
    the last one and deleted the text in between.
    """
    x = re.sub(r'\{\{(?:[^{}|]*\|){2,}([^{}|]*)\}\}', r'\1', x)
    x = re.sub(r'<[^<>]*/>', '', x)
    # Non-greedy body so two separate <ref> pairs are removed one by one.
    x = re.sub(r'<([^>\s/]+)(?:\s[^>]*)?>.*?</\1>', '', x)
    x = re.sub(r'\{\{0\}\}', '', x)
    return x
|
||
if __name__ == '__main__':
    # Read the template fields, then clean each value in three steps:
    # emphasis quotes, internal links, remaining markup.
    source_path = 'jawiki-uk.txt'
    field_pattern = r'\|(.*?)\s=\s*(.+)'

    cleaned = {}
    for name, raw in basic_dict(source_path, field_pattern).items():
        without_emphasis = re.sub(r"''+", '', raw)
        cleaned[name] = remove_markups(remove_link(without_emphasis))

    for name, value in cleaned.items():
        print(name, value)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
from knock28 import * | ||
import re | ||
import requests | ||
|
||
def get_url(text):
    """Return the URL of the national flag image via the MediaWiki API.

    Args:
        text: Mapping of template fields; the '国旗画像' entry holds the
            flag image file name.

    Returns:
        The first ``url`` value found in the ``imageinfo`` of the response.

    Raises:
        KeyError: If ``text`` has no '国旗画像' entry.
        requests.RequestException: On network errors, a timeout or a
            non-2xx HTTP status.
        ValueError: If the response contains no image URL.
    """
    file_title = text['国旗画像'].replace(' ', '_')
    # Passing `params` lets requests percent-encode the title; the old
    # hand-built query string broke on names containing '&', '#' or '+'.
    response = requests.get(
        'https://commons.wikimedia.org/w/api.php',
        params={
            'action': 'query',
            'titles': 'File:' + file_title,
            'prop': 'imageinfo',
            'iiprop': 'url',
            'format': 'json',
        },
        timeout=10,  # never hang forever on an unresponsive server
    )
    response.raise_for_status()

    # Decode the JSON instead of regex-scraping the raw body, so escaped
    # characters are handled and a missing URL gives a clear error.
    pages = response.json().get('query', {}).get('pages', {})
    for page in pages.values():
        for info in page.get('imageinfo', []):
            if 'url' in info:
                return info['url']
    raise ValueError(f'no image URL returned for File:{file_title}')
|
||
|
||
if __name__ == '__main__':
    # Build the cleaned field dict (emphasis quotes, internal links and other
    # markup removed), then look up and print the flag image URL.
    source_path = 'jawiki-uk.txt'
    field_pattern = r'\|(.*?)\s=\s*(.+)'

    cleaned = {}
    for name, raw in basic_dict(source_path, field_pattern).items():
        without_emphasis = re.sub(r"''+", '', raw)
        cleaned[name] = remove_markups(remove_link(without_emphasis))

    flag_url = get_url(cleaned)
    print(flag_url)