submit chapter03

tmu-nlp · Jul 22, 2022 · 85fbd6e · 85fbd6e
1 parent 93e494a
commit 85fbd6e
Show file tree

Hide file tree

Showing 10 changed files with 245 additions and 0 deletions.
diff --git a/wei/chapter03/knock20.py b/wei/chapter03/knock20.py
@@ -0,0 +1,15 @@
+'''20. Reading JSON data
+Read the JSON file of Wikipedia articles and extract the body of the article
+about the United Kingdom (title "イギリス").
+Problems 21-29 run on the article text extracted here.
+
+The article text is written to jawiki-uk.txt.
+'''
+
+
+import gzip
+import json
+
+# Context managers make sure both files are closed, even if a line fails to
+# parse or the expected keys are missing.
+with gzip.open('jawiki-country.json.gz') as f, \
+        open('jawiki-uk.txt', 'w', encoding='utf-8') as f_uk:
+    for line in f:
+        obj = json.loads(line)     # json.loads() turns the JSON string into a dict
+        if obj['title'] == 'イギリス':
+            f_uk.write(obj['text'])
+
diff --git a/wei/chapter03/knock21.py b/wei/chapter03/knock21.py
@@ -0,0 +1,14 @@
+'''21. Extract lines containing category names
+Extract the lines of the article that declare a category name.
+
+Each [[Category:...]] declaration found in jawiki-uk.txt is written to
+knock21_output.txt, one per line.
+'''
+
+import re
+
+# Compiled once instead of being re-looked-up for every line.
+category_decl = re.compile(r'\[\[Category:.*\]\]')
+
+# "with" closes both files even if reading or writing raises.
+with open('jawiki-uk.txt', 'r', encoding='utf-8') as f_uk, \
+        open('knock21_output.txt', 'w', encoding='utf-8') as f_out:
+    for line in f_uk:
+        rs = category_decl.search(line)
+        if rs:
+            f_out.write(rs.group() + '\n')
diff --git a/wei/chapter03/knock22.py b/wei/chapter03/knock22.py
@@ -0,0 +1,16 @@
+'''22. Extract category names
+Extract the category names of the article (by name, not as whole lines).
+
+The names found in jawiki-uk.txt are written to knock22_output.txt, one per
+line.
+'''
+
+
+import re
+
+# [[Category:Name]] or [[Category:Name|...]] at the end of the line.
+# group(1) is the bare name; the optional "|..." suffix is dropped.
+category_name = re.compile(r'\[\[Category:(.*?)(\|.*)?\]\]$')
+
+# "with" closes both files even if reading or writing raises.
+with open('jawiki-uk.txt', 'r', encoding='utf-8') as f_uk, \
+        open('knock22_output.txt', 'w', encoding='utf-8') as f_out:
+    for line in f_uk:
+        rs2 = category_name.search(line.strip())
+        if rs2:
+            f_out.write(rs2.group(1) + '\n')
diff --git a/wei/chapter03/knock23.py b/wei/chapter03/knock23.py
@@ -0,0 +1,19 @@
+'''23. Section structure
+Show the section names contained in the article together with their level
+(for example, "== section name ==" is level 1).
+
+One "レベル<level> <name>" line per heading is written to knock23_output.txt.
+'''
+
+
+import re
+
+# Heading line: group(1) = leading run of '=', group(2) = heading text,
+# group(3) = trailing run of '='.
+# Regex group reference: https://www.javadrive.jp/python/regex/index4.html
+heading = re.compile(r'(\=+)(.*?)(\=+)$')
+
+# "with" closes both files even if reading or writing raises.
+with open('jawiki-uk.txt', 'r', encoding='utf-8') as f_uk, \
+        open('knock23_output.txt', 'w', encoding='utf-8') as f_out:
+    for line in f_uk:
+        rs = heading.match(line.strip())
+        if rs:
+            section_name = rs.group(2).strip()
+            # "==" is level 1, so the level is the number of '=' minus one.
+            section_level = len(rs.group(1)) - 1
+            f_out.write(f'レベル{section_level} {section_name}\n')
diff --git a/wei/chapter03/knock24.py b/wei/chapter03/knock24.py
@@ -0,0 +1,17 @@
+'''
+24. Extract file references
+Extract every media file referenced from the article.
+
+The file names found in jawiki-uk.txt are written to knock24_output.txt, one
+per line.
+'''
+
+import re
+
+# Capture the file name that follows "[[ファイル:", stopping at the first '|'
+# (start of the display options) or ']' (end of the link).  The pattern is
+# not anchored to the end of the line, so references followed by other text
+# and several references on one line are all found.
+media_file = re.compile(r'\[\[ファイル:([^|\]]+)')
+
+# "with" closes both files even if reading or writing raises.
+with open('jawiki-uk.txt', 'r', encoding='utf-8') as f, \
+        open('knock24_output.txt', 'w', encoding='utf-8') as f_out:
+    for line in f:
+        for refer in media_file.findall(line):
+            f_out.write(f'{refer}\n')
diff --git a/wei/chapter03/knock25.py b/wei/chapter03/knock25.py
@@ -0,0 +1,20 @@
+'''
+25. Extract the template
+Extract the field names and values of the "基礎情報" (basic information)
+template contained in the article and store them in a dictionary object.
+
+The resulting "name : value" pairs are written to knock25_output.txt.
+'''
+
+import re
+
+basis_info = {}
+# "with" closes the input file even if reading raises.
+with open('jawiki-uk.txt', 'r', encoding='utf-8') as f:
+    for line in f:
+        # A field line looks like "|name = value": group(1) is the name,
+        # group(3) the value.
+        rs = re.match(r'\|(.*?)(\s\=\s*)(.*?)$', line.strip())
+        if rs:
+            basis_info[rs.group(1)] = rs.group(3)
+
+with open('knock25_output.txt', 'w', encoding='utf-8') as f_out:
+    for my_key, my_value in basis_info.items():
+        f_out.write(f'{my_key} : {my_value}\n')
diff --git a/wei/chapter03/knock26.py b/wei/chapter03/knock26.py
@@ -0,0 +1,18 @@
+'''
+26. Remove emphasis markup
+While doing the processing of 25, remove the MediaWiki emphasis markup (weak
+emphasis, emphasis and strong emphasis alike) from the template values and
+convert them to plain text.
+
+The resulting "name : value" pairs are written to knock26_output.txt.
+'''
+
+import re
+
+# "with" closes both files; without it the output file was never closed, so
+# flushing the buffered output was left to interpreter shutdown.
+with open('jawiki-uk.txt', 'r', encoding='utf-8') as f, \
+        open('knock26_output.txt', 'w', encoding='utf-8') as f_out:
+    for line in f:
+        # A field line looks like "|name = value": group(1) is the name,
+        # group(3) the value.
+        rs = re.match(r'\|(.*?)(\s\=\s*)(.*?)$', line.strip())
+        if rs:
+            field_value = rs.group(3)
+            # Emphasis is written with runs of apostrophes; drop every run.
+            rs_empha_delete = re.sub(r'\'+', '', field_value)
+            f_out.write(f'{rs.group(1)} : {rs_empha_delete}\n')
diff --git a/wei/chapter03/knock27.py b/wei/chapter03/knock27.py
@@ -0,0 +1,32 @@
+'''
+27. Remove internal links
+In addition to the processing of 26, remove the MediaWiki internal-link
+markup from the template values and convert them to plain text.
+
+The resulting "name : value" pairs are written to knock27_output.txt.
+'''
+
+import re
+
+# 25: a field line looks like "|name = value".
+field_line = re.compile(r'\|(.*?)\s\=\s*(.*?)$')
+# 27: [[target]] or [[target|label]]; group(1) is the text to keep (the label
+# when there is one, otherwise the target).  "[[ファイル:" links are media
+# references rather than internal links, so the lookahead leaves them alone.
+internal_link = re.compile(r'\[\[(?!ファイル:)(?:[^\[\]|]*\|)?([^\[\]]*)\]\]')
+
+# "with" closes both files even if reading or writing raises.
+with open('jawiki-uk.txt', 'r', encoding='utf-8') as f, \
+        open('knock27_output.txt', 'w', encoding='utf-8') as f_out:
+    for line in f:
+        rs = field_line.match(line.strip())
+        if not rs:
+            continue
+        value = re.sub(r'\'+', '', rs.group(2))    # 26: remove emphasis markup
+        # Substituting (instead of matching once) resolves every link in the
+        # value, not only the last piped one.
+        value = internal_link.sub(r'\1', value)
+        f_out.write(f'{rs.group(1)} : {value}\n')
diff --git a/wei/chapter03/knock28.py b/wei/chapter03/knock28.py
@@ -0,0 +1,60 @@
+'''
+28. MediaWikiマークアップの除去
+27の処理に加えて，テンプレートの値からMediaWikiマークアップを可能な限り除去し，国の基本情報を整形せよ．
+'''
+
+import re
+
+def basic_dict(filename, pattern):
+    """Collect the fields of the "基礎情報 国" template found in a file.
+
+    Lines are ignored until one containing "{{基礎情報 国" is seen; parsing then
+    stops at the first line consisting only of "}}".
+
+    Args:
+        filename: Path of the UTF-8 text file holding the article.
+        pattern: Regular expression applied to each template line; group(1)
+            must capture the field name and group(2) the field value.
+
+    Returns:
+        A dict mapping field name to field value, both stripped of
+        surrounding whitespace.  It is empty when the template is not found.
+    """
+    fields = {}
+    in_template = False
+    with open(filename, 'r', encoding='utf-8') as f:
+        for line in f:
+            if not in_template:
+                # Skip everything before the template so that unrelated
+                # "|name = value" lines are not picked up.
+                if re.search(r'{{基礎情報\s*国', line):
+                    in_template = True
+                continue
+            if re.search(r'^}}$', line):
+                break
+            field = re.search(pattern, line)
+            if field:
+                # strip() with no argument removes whitespace; strip('')
+                # removes nothing.
+                fields[field.group(1).strip()] = field.group(2).strip()
+    return fields
+
+# Remove internal-link markup.
+def remove_link(x):
+    """Return x with [[...]] link markup replaced by its last captured part.
+
+    The patterns are applied in order, from the most specific to the most
+    general, and each one keeps only its capture group.
+    """
+    link_patterns = (
+        r'\[\[[^\|\]]+\|[^{}\|\]]+\|([^\]]+)\]\]',   # [[a|b|kept]]
+        r'\[\[[^\|\]]+\|([^\]]+)\]\]',               # [[a|kept]]
+        r'\[\[([^\]]+)\]\]',                         # [[kept]]
+    )
+    for link_pattern in link_patterns:
+        x = re.sub(link_pattern, r'\1', x)
+    return x
+
+def remove_markups(x):
+    """Return x with template, paired-tag and {{0}} markup removed.
+
+    The substitutions run in this order:
+      1. a {{...|...|last}} template is replaced by its last argument,
+      2. a <tag ...>...</tag> pair is deleted together with its content,
+      3. the literal {{0}} is deleted.
+    """
+    substitutions = (
+        (r'{{.*\|.*\|([^}]*)}}', r'\1'),
+        (r'<([^>]*)( .*|)>.*</\1>', ''),
+        (r'\{\{0\}\}', ''),
+    )
+    for markup_pattern, replacement in substitutions:
+        x = re.sub(markup_pattern, replacement, x)
+    return x
+
+if __name__ == '__main__':
+    # Read the template fields, then clean every value in three steps
+    # (emphasis quotes, internal links, remaining markup) and print it.
+    raw_fields = basic_dict('jawiki-uk.txt', r'\|(.*?)\s=\s*(.+)')
+    for field_name, raw_value in raw_fields.items():
+        without_emphasis = re.sub(r"''+", '', raw_value)
+        without_links = remove_link(without_emphasis)
+        print(field_name, remove_markups(without_links))
diff --git a/wei/chapter03/knock29.py b/wei/chapter03/knock29.py
@@ -0,0 +1,34 @@
+from knock28 import *
+import re
+import requests
+
+def get_url(text):
+    """Return the URL of the national-flag image named in text.
+
+    Args:
+        text: Mapping of template fields; the image file name is read from
+            text['国旗画像'].
+
+    Returns:
+        The image URL reported by the Wikimedia Commons imageinfo query.
+
+    Raises:
+        KeyError: If text has no '国旗画像' entry.
+        requests.RequestException: On a network failure, a timeout or an
+            HTTP error status.
+        ValueError: If the response contains no image URL.
+    """
+    url_file = text['国旗画像'].replace(' ', '_')
+    # Pass the query as params so that requests percent-encodes the file
+    # name; a hand-concatenated URL breaks on names containing '&', '#', etc.
+    params = {
+        'action': 'query',
+        'titles': 'File:' + url_file,
+        'prop': 'imageinfo',
+        'iiprop': 'url',
+        'format': 'json',
+    }
+    # The timeout keeps the script from hanging forever on a stalled server.
+    data = requests.get('https://commons.wikimedia.org/w/api.php',
+                        params=params, timeout=10)
+    data.raise_for_status()
+    # Decode the JSON instead of regex-matching the raw body, so any JSON
+    # escaping in the URL is undone.
+    pages = data.json().get('query', {}).get('pages', {})
+    for page in pages.values():
+        for info in page.get('imageinfo', []):
+            if 'url' in info:
+                return info['url']
+    raise ValueError(f'no image URL found for {url_file!r}')
+
+
+if __name__ == '__main__':
+    # Read the template fields and clean every value in three steps
+    # (emphasis quotes, internal links, remaining markup).
+    raw_fields = basic_dict('jawiki-uk.txt', r'\|(.*?)\s=\s*(.+)')
+    cleaned_fields = {}
+    for field_name, raw_value in raw_fields.items():
+        cleaned = re.sub(r"''+", '', raw_value)
+        cleaned = remove_link(cleaned)
+        cleaned_fields[field_name] = remove_markups(cleaned)
+
+    # Look up and print the URL of the flag image named in the cleaned fields.
+    print(get_url(cleaned_fields))