add files

quarter-100 · Dec 17, 2021 · d858144 · d858144
commit d858144
Show file tree

Hide file tree

Showing 3 changed files with 1,472 additions and 0 deletions.
diff --git a/preprocessing.py b/preprocessing.py
@@ -0,0 +1,33 @@
+import pandas as pd
+import re
+import csv
+
+def del_html(text):
+    """Strip line breaks, square-bracketed spans and HTML tags from *text*.
+
+    Args:
+        text: Raw paragraph string to clean.
+
+    Returns:
+        The cleaned string: newlines and non-breaking spaces are removed,
+        every ``[...]`` span is removed, every ``<...>`` tag is replaced by
+        a space, and runs of spaces are collapsed into a single space.
+
+    Raises:
+        TypeError: If *text* is not a string (e.g. a NaN cell read by pandas).
+    """
+    # Remove line breaks.
+    text = re.sub("\n", '', text)
+
+    # Remove square-bracketed spans. The character class must exclude ']'
+    # (not ')'); otherwise a single greedy match deletes everything between
+    # the first '[' and the last ']', including the normal text in between.
+    regex = r'\[[^\]]*\]'
+    text = re.sub(regex, '', text)
+
+    # Replace each HTML tag with a space so adjacent words stay separated.
+    regex = '<.*?>'
+    text = re.sub(regex, ' ', text)
+
+    # Miscellaneous cleanup.
+    text = re.sub("\xa0", '', text)
+    text = re.sub(r' +', ' ', text)  # collapse runs of spaces into one
+
+    return text
+
+# Clean every paragraph of the source CSV with del_html and write the cleaned
+# text to a new single-column CSV.
+# NOTE(review): a missing (NaN) 'paragraph_txt' cell would make del_html raise
+# TypeError -- confirm the column has no gaps.
+data = pd.read_csv('./clear/writing/writing_paragraph.csv')
+cleaned_paragraphs = [del_html(paragraph) for paragraph in data['paragraph_txt']]
+
+output_df = pd.DataFrame({'paragraph_txt': cleaned_paragraphs})
+output_df.to_csv('new_writing_para.csv', index=False)
diff --git a/spell.py b/spell.py
@@ -0,0 +1,49 @@
+import pandas as pd
+import re
+import csv
+from hanspell import spell_checker
+from tqdm import tqdm
+from sklearn.metrics import f1_score
+from pykospacing import Spacing
+
+# For every paragraph, compare the spacing-corrected sentences with their
+# spell-checked versions token by token and store one micro-averaged F1 score
+# per paragraph.
+# NOTE(review): assumes every 'paragraph_txt' cell is a string -- a missing
+# (NaN) cell would fail on .split(); confirm against the input file.
+data = pd.read_csv('./new_agree_para.csv')
+new_data = data['paragraph_txt']
+
+# Build the spacing model once instead of once per sentence.
+spacing = Spacing()
+
+
+def _space_then_spell(sentence):
+    """Return (spacing-corrected sentence, spell-checked version of it)."""
+    spaced = spacing(sentence)
+    return spaced, spell_checker.check(spaced).checked
+
+
+f1_list = []
+for cnt, paragraph in enumerate(tqdm(new_data), start=1):
+    print(cnt)
+    spell_list = []
+    space_list = []
+    for sentence in paragraph.split('#@문장구분#'):
+        try:
+            spaced, checked = _space_then_spell(sentence)
+        except Exception:
+            # Retry once with tab characters removed; a second failure
+            # propagates to the caller.
+            spaced, checked = _space_then_spell(sentence.replace("\t", ""))
+        # Append only after both steps succeeded so the two lists stay
+        # aligned even when the first attempt failed half-way through.
+        space_list.append(spaced)
+        spell_list.append(checked)
+
+    # The spell-checked tokens are passed as the reference labels and the
+    # spacing-only tokens as the predictions. zip() stops at the shorter
+    # token list when the two versions differ in length.
+    cor_list = []
+    pred_list = []
+    for corrected, spaced in zip(spell_list, space_list):
+        for cor_token, pred_token in zip(corrected.split(' '), spaced.split(' ')):
+            cor_list.append(cor_token)
+            pred_list.append(pred_token)
+
+    f1_list.append(f1_score(cor_list, pred_list, average='micro'))
+
+output_df = pd.DataFrame({'f1': f1_list})
+output_df.to_csv('spell_s_agree_para.csv', index=False)