Skip to content

Commit

Permalink
submit chapter03
Browse files Browse the repository at this point in the history
  • Loading branch information
Vita112 committed Jul 22, 2022
1 parent 93e494a commit 85fbd6e
Show file tree
Hide file tree
Showing 10 changed files with 245 additions and 0 deletions.
15 changes: 15 additions & 0 deletions wei/chapter03/knock20.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
'''20. JSONデータの読み込み
Wikipedia記事のJSONファイルを読み込み,「イギリス」に関する記事本文を表示せよ.
問題21-29では,ここで抽出した記事本文に対して実行せよ.'''


import gzip
import json
def extract_article_text(lines, title):
    """Yield the body text of every article whose title equals ``title``.

    Args:
        lines: Iterable of JSON-encoded records (``str`` or ``bytes``), one
            per item; each record is an object with ``title`` and ``text``
            keys.
        title: Exact article title to select.

    Yields:
        The ``text`` value of each matching record, in input order.
    """
    for line in lines:
        if not line.strip():
            # Tolerate blank lines instead of crashing inside json.loads.
            continue
        article = json.loads(line)  # JSON string -> dict
        if article['title'] == title:
            yield article['text']


if __name__ == '__main__':
    # Context managers close both handles even when parsing fails part-way;
    # previously the gzip handle was never closed at all.
    with gzip.open('jawiki-country.json.gz', 'rt', encoding='utf-8') as f, \
            open('jawiki-uk.txt', 'w', encoding='utf-8') as f_uk:
        for text in extract_article_text(f, 'イギリス'):
            f_uk.write(text)

14 changes: 14 additions & 0 deletions wei/chapter03/knock21.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
'''21. カテゴリ名を含む行を抽出
記事中でカテゴリ名を宣言している行を抽出せよ.
'''

import re
def extract_category_lines(lines):
    """Return the category declarations found in ``lines``.

    Args:
        lines: Iterable of article text lines.

    Returns:
        A list holding, for each line that contains ``[[Category:...]]``,
        the matched text from ``[[Category:`` through the last ``]]`` on
        that line (no trailing newline).
    """
    declaration = re.compile(r'\[\[Category:.*\]\]')
    found = (declaration.search(line) for line in lines)
    return [match.group() for match in found if match]


if __name__ == '__main__':
    # ``with`` guarantees both files are closed even if an error occurs,
    # and iterating the handle avoids loading the article via readlines().
    with open('jawiki-uk.txt', 'r', encoding='utf-8') as f_uk, \
            open('knock21_output.txt', 'w', encoding='utf-8') as f_out:
        f_out.writelines(f'{text}\n' for text in extract_category_lines(f_uk))
16 changes: 16 additions & 0 deletions wei/chapter03/knock22.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
'''22. カテゴリ名の抽出
記事のカテゴリ名を(行単位ではなく名前で)抽出せよ.'''


import re
def extract_category_names(lines):
    """Return every category name declared in ``lines``.

    The name is the text between ``[[Category:`` and the first ``|`` or
    ``]]``; an optional ``|sort key`` part is discarded.

    Args:
        lines: Iterable of article text lines.

    Returns:
        A list of category names in order of appearance.
    """
    # Character classes instead of an end-of-line anchored lazy match, so
    # that several declarations on one line, or a declaration followed by
    # other text, each yield a clean name.
    category = re.compile(r'\[\[Category:([^|\]]+)(?:\|[^\]]*)?\]\]')
    names = []
    for line in lines:
        names.extend(category.findall(line))
    return names


if __name__ == '__main__':
    # ``with`` closes both files even if an error occurs mid-way.
    with open('jawiki-uk.txt', 'r', encoding='utf-8') as f_uk, \
            open('knock22_output.txt', 'w', encoding='utf-8') as f_out:
        f_out.writelines(f'{name}\n' for name in extract_category_names(f_uk))
19 changes: 19 additions & 0 deletions wei/chapter03/knock23.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
'''23. セクション構造
記事中に含まれるセクション名とそのレベル(例えば”== セクション名 ==”なら1)を表示せよ.'''


import re

def parse_sections(lines):
    """Return ``(level, name)`` for each section heading in ``lines``.

    A heading is a line that, once stripped, starts with a run of ``=`` and
    ends with a run of ``=``. The level is the length of the leading run
    minus one, so ``== name ==`` is level 1.

    Args:
        lines: Iterable of article text lines.

    Returns:
        A list of ``(level, name)`` tuples in order of appearance.
    """
    # group(1): leading '=' run, group(2): heading text, group(3): trailing
    # '=' run.
    heading = re.compile(r'(\=+)(.*?)(\=+)$')
    sections = []
    for line in lines:
        match = heading.match(line.strip())
        if match:
            section_level = len(match.group(1)) - 1
            section_name = match.group(2).strip()
            sections.append((section_level, section_name))
    return sections


if __name__ == '__main__':
    # ``with`` closes both files even if an error occurs mid-way.
    with open('jawiki-uk.txt', 'r', encoding='utf-8') as f_uk, \
            open('knock23_output.txt', 'w', encoding='utf-8') as f_out:
        for section_level, section_name in parse_sections(f_uk):
            f_out.write(f'レベル{section_level} {section_name}\n')
17 changes: 17 additions & 0 deletions wei/chapter03/knock24.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
'''
24. ファイル参照の抽出
記事から参照されているメディアファイルをすべて抜き出せ.
'''

import re
def extract_media_files(lines):
    """Return the names of all media files referenced in ``lines``.

    A reference is ``[[ファイル:NAME...`` or ``[[File:NAME...``; NAME ends at
    the first ``|`` or ``]``.

    Args:
        lines: Iterable of article text lines.

    Returns:
        A list of file names in order of appearance, with surrounding
        whitespace removed.
    """
    # No end-of-line anchor and no "first match only": a file link may sit
    # in the middle of a line and a line may hold several of them.
    media = re.compile(r'\[\[(?:ファイル|File):([^|\]]+)')
    files = []
    for line in lines:
        files.extend(name.strip() for name in media.findall(line))
    return files


if __name__ == '__main__':
    # ``with`` closes both files even if an error occurs mid-way.
    with open('jawiki-uk.txt', 'r', encoding='utf-8') as f, \
            open('knock24_output.txt', 'w', encoding='utf-8') as f_out:
        for refer in extract_media_files(f):
            f_out.write(f'{refer}\n')
20 changes: 20 additions & 0 deletions wei/chapter03/knock25.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
'''
25. テンプレートの抽出
記事中に含まれる「基礎情報」テンプレートのフィールド名と値を抽出し,辞書オブジェクトとして格納せよ.
'''

import re
def parse_basic_info(lines):
    """Collect the fields of the 基礎情報 template into a dict.

    Only lines between the line starting with ``{{基礎情報`` and the next
    line consisting solely of ``}}`` are considered, so ``|name = value``
    lines elsewhere in the article are not mistaken for template fields.
    Continuation lines of multi-line values are ignored.

    Args:
        lines: Iterable of article text lines.

    Returns:
        Dict mapping field name to value, both without surrounding
        whitespace, in template order.
    """
    field = re.compile(r'\|(.*?)\s=\s*(.*)$')
    basis_info = {}
    in_template = False
    for line in lines:
        line = line.strip()
        if not in_template:
            in_template = line.startswith('{{基礎情報')
            continue
        if line == '}}':
            break
        match = field.match(line)
        if match:
            # The pattern consumes a single whitespace before '='; strip the
            # key so "|略名  =" does not yield a key with a trailing space.
            basis_info[match.group(1).strip()] = match.group(2)
    return basis_info


if __name__ == '__main__':
    # ``with`` closes each file even if an error occurs mid-way.
    with open('jawiki-uk.txt', 'r', encoding='utf-8') as f:
        basis_info = parse_basic_info(f)
    with open('knock25_output.txt', 'w', encoding='utf-8') as f_out:
        for my_key, my_value in basis_info.items():
            f_out.write(f'{my_key} : {my_value}\n')
18 changes: 18 additions & 0 deletions wei/chapter03/knock26.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
'''
26. 強調マークアップの除去
25の処理時に,テンプレートの値からMediaWikiの強調マークアップ(弱い強調,強調,強い強調のすべて)を除去してテキストに変換せよ
'''

import re
def remove_emphasis(text):
    """Remove MediaWiki emphasis markup (runs of two or more apostrophes).

    A single apostrophe is ordinary text and is kept.
    """
    return re.sub(r"'{2,}", '', text)


def parse_basic_info_without_emphasis(lines):
    """Collect the 基礎情報 template fields with emphasis markup removed.

    Only lines between the line starting with ``{{基礎情報`` and the next
    line consisting solely of ``}}`` are considered. Continuation lines of
    multi-line values are ignored.

    Args:
        lines: Iterable of article text lines.

    Returns:
        Dict mapping field name to cleaned value, in template order.
    """
    field = re.compile(r'\|(.*?)\s=\s*(.*)$')
    info = {}
    in_template = False
    for line in lines:
        line = line.strip()
        if not in_template:
            in_template = line.startswith('{{基礎情報')
            continue
        if line == '}}':
            break
        match = field.match(line)
        if match:
            info[match.group(1).strip()] = remove_emphasis(match.group(2))
    return info


if __name__ == '__main__':
    # ``with`` closes each file; the output handle was previously never
    # closed, so buffered data relied on interpreter shutdown to be flushed.
    with open('jawiki-uk.txt', 'r', encoding='utf-8') as f:
        info = parse_basic_info_without_emphasis(f)
    with open('knock26_output.txt', 'w', encoding='utf-8') as f_out:
        for field_name, field_value in info.items():
            f_out.write(f'{field_name} : {field_value}\n')
32 changes: 32 additions & 0 deletions wei/chapter03/knock27.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
'''
27. 内部リンクの除去
26の処理に加えて,テンプレートの値からMediaWikiの内部リンクマークアップを除去し,テキストに変換せよ
'''

import re

def strip_emphasis(text):
    """Remove MediaWiki emphasis markup (runs of two or more apostrophes)."""
    return re.sub(r"'{2,}", '', text)


def strip_internal_links(text):
    """Replace every internal link with its visible text.

    ``[[target]]`` becomes ``target`` and ``[[target|label]]`` becomes
    ``label``. Links starting with ``ファイル:`` or ``File:`` are media
    embeds rather than internal links and are left untouched.
    """
    # One substitution handles every link on the line; resolving only the
    # last piped link and then deleting the brackets would turn any other
    # ``[[A|a]]`` into ``A|a``.
    return re.sub(
        r'\[\[(?!ファイル:|File:)(?:[^|\]]+\|)?([^|\]]+)\]\]', r'\1', text)


def parse_basic_info_without_links(lines):
    """Collect the 基礎情報 template fields as plain text.

    Only lines between the line starting with ``{{基礎情報`` and the next
    line consisting solely of ``}}`` are considered. Emphasis markup and
    internal-link markup are removed from each value. Continuation lines of
    multi-line values are ignored.

    Args:
        lines: Iterable of article text lines.

    Returns:
        Dict mapping field name to cleaned value, in template order.
    """
    field = re.compile(r'\|(.*?)\s=\s*(.*)$')  # knock 25: field extraction
    info = {}
    in_template = False
    for line in lines:
        line = line.strip()
        if not in_template:
            in_template = line.startswith('{{基礎情報')
            continue
        if line == '}}':
            break
        match = field.match(line)
        if match:
            value = strip_emphasis(match.group(2))  # knock 26
            info[match.group(1).strip()] = strip_internal_links(value)
    return info


if __name__ == '__main__':
    # ``with`` closes each file even if an error occurs mid-way.
    with open('jawiki-uk.txt', 'r', encoding='utf-8') as f:
        info = parse_basic_info_without_links(f)
    with open('knock27_output.txt', 'w', encoding='utf-8') as f_out:
        for field_name, field_value in info.items():
            f_out.write(f'{field_name} : {field_value}\n')
60 changes: 60 additions & 0 deletions wei/chapter03/knock28.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
'''
28. MediaWikiマークアップの除去
27の処理に加えて,テンプレートの値からMediaWikiマークアップを可能な限り除去し,国の基本情報を整形せよ.
'''

import re

def basic_dict(filename, pattern):
    """Read the 基礎情報 国 template from ``filename`` into a dict.

    Scanning starts after the first line matching ``{{基礎情報 国`` and stops
    at the first following line that consists solely of ``}}``.

    Args:
        filename: Path of a UTF-8 text file holding the article markup.
        pattern: Regular expression applied to each template line with
            ``re.search``; group 1 is the field name and group 2 the value.

    Returns:
        Dict mapping field name to value, both with surrounding whitespace
        removed. Lines that do not match ``pattern`` are skipped.
    """
    fields = {}  # not named ``dict``: that would shadow the builtin
    in_template = False
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if re.search(r'{{基礎情報\s*国', line):
                in_template = True
                continue
            if not in_template:
                continue
            if re.search(r'^}}$', line):
                break
            field = re.search(pattern, line)
            if field:
                key = field.group(1).strip()
                # strip() with no argument trims whitespace; strip('') has
                # an empty character set and therefore removes nothing.
                fields[key] = field.group(2).strip()
    return fields

# Remove internal-link markup.
def remove_link(x):
    """Return ``x`` with ``[[...]]`` link markup replaced by its last part.

    Three substitutions run in order: ``[[a|b|c]]`` becomes ``c``, then
    ``[[a|b]]`` becomes ``b``, then ``[[a]]`` becomes ``a``.
    """
    link_patterns = (
        r'\[\[[^\|\]]+\|[^{}\|\]]+\|([^\]]+)\]\]',
        r'\[\[[^\|\]]+\|([^\]]+)\]\]',
        r'\[\[([^\]]+)\]\]',
    )
    unlinked = x
    for link_pattern in link_patterns:
        unlinked = re.sub(link_pattern, r'\1', unlinked)
    return unlinked

def remove_markups(x):
    """Return ``x`` with common non-link MediaWiki/HTML markup removed.

    Applied in order:

    1. A template with at least two ``|`` separators, e.g.
       ``{{lang|en|text}}``, is replaced by its last parameter.
    2. A paired tag together with its content, e.g. ``<ref>...</ref>``,
       is deleted.
    3. A self-closing tag, e.g. ``<br />``, is deleted.
    4. The ``{{0}}`` placeholder is deleted.

    Args:
        x: Template value as a string.

    Returns:
        The cleaned string.
    """
    # Brace-free character classes keep each match inside a single
    # template; a greedy ``.*`` would swallow everything between the first
    # ``{{`` and the last ``}}`` when a value holds two templates.
    x = re.sub(r'\{\{[^{}]*\|[^{}]*\|([^{}|]*)\}\}', r'\1', x)
    # Lazy ``.*?`` stops at the nearest closing tag, so text between two
    # separate tag pairs is preserved.
    x = re.sub(r'<(\w+)(?:\s[^<>]*)?>.*?</\1>', '', x)
    x = re.sub(r'<\w+[^<>]*/>', '', x)
    x = re.sub(r'\{\{0\}\}', '', x)
    return x

if __name__ == '__main__':
    # Parse the template, then clean every value in three steps: drop
    # emphasis quotes, resolve internal links, strip remaining markup.
    source_path = 'jawiki-uk.txt'
    field_pattern = r'\|(.*?)\s=\s*(.+)'

    cleaned = {}
    for name, raw in basic_dict(source_path, field_pattern).items():
        without_emphasis = re.sub(r"''+", '', raw)
        without_links = remove_link(without_emphasis)
        cleaned[name] = remove_markups(without_links)

    for name, value in cleaned.items():
        print(name, value)
34 changes: 34 additions & 0 deletions wei/chapter03/knock29.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from knock28 import *
import re
import requests

def get_url(text):
    """Look up the URL of the national flag image on Wikimedia Commons.

    Args:
        text: Mapping of template fields; its ``'国旗画像'`` entry is the
            image file name.

    Returns:
        The ``url`` of the first ``imageinfo`` entry the API reports for
        the file.

    Raises:
        KeyError: ``text`` has no ``'国旗画像'`` entry.
        requests.HTTPError: The API answered with an error status.
        ValueError: The response contains no image URL (e.g. unknown file).
    """
    url_file = text['国旗画像'].replace(' ', '_')
    # Passing ``params`` lets requests percent-encode the file name; plain
    # string concatenation breaks on names containing '&', '#', '+' etc.
    params = {
        'action': 'query',
        'titles': 'File:' + url_file,
        'prop': 'imageinfo',
        'iiprop': 'url',
        'format': 'json',
    }
    response = requests.get(
        'https://commons.wikimedia.org/w/api.php', params=params, timeout=10)
    response.raise_for_status()
    # Decode the JSON instead of regex-scraping the raw text, so escape
    # sequences are resolved and a missing URL is reported clearly.
    pages = response.json().get('query', {}).get('pages', {})
    for page in pages.values():
        for info in page.get('imageinfo', []):
            if 'url' in info:
                return info['url']
    raise ValueError(f'no image URL returned for File:{url_file}')


if __name__ == '__main__':
    # Build the cleaned template dict (emphasis, links and other markup
    # removed from every value), then resolve and print the flag image URL.
    source_path = 'jawiki-uk.txt'
    field_pattern = r'\|(.*?)\s=\s*(.+)'

    cleaned = {}
    for name, raw in basic_dict(source_path, field_pattern).items():
        without_emphasis = re.sub(r"''+", '', raw)
        without_links = remove_link(without_emphasis)
        cleaned[name] = remove_markups(without_links)

    print(get_url(cleaned))

0 comments on commit 85fbd6e

Please sign in to comment.