Skip to content

Commit

Permalink
add files
Browse files Browse the repository at this point in the history
  • Loading branch information
Jay-Ppark committed Dec 17, 2021
0 parents commit d858144
Show file tree
Hide file tree
Showing 3 changed files with 1,472 additions and 0 deletions.
33 changes: 33 additions & 0 deletions preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pandas as pd
import re
import csv

def del_html(text):
    """Strip line breaks, square-bracketed spans and HTML-like tags from text.

    Steps, in order:
      1. remove newline characters (nothing is inserted in their place);
      2. remove every ``[...]`` span;
      3. replace every ``<...>`` tag with a single space;
      4. remove non-breaking spaces (``\\xa0``);
      5. collapse runs of spaces into one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Remove line breaks.
    text = re.sub("\n", '', text)

    # Remove square-bracketed spans. The character class excludes ']' so each
    # bracket pair is matched on its own; excluding ')' instead would let one
    # greedy match swallow all text between the first '[' and the last ']'.
    regex = r'\[[^\]]*\]'
    text = re.sub(regex, '', text)

    # Replace HTML-like tags with a space (non-greedy, one tag per match).
    regex = '<.*?>'
    text = re.sub(regex, ' ', text)

    # Miscellaneous cleanup.
    text = re.sub("\xa0", '', text)
    text = re.sub(r'\ *\ ', ' ', text)  # collapse multiple spaces into one

    return text

# Read the raw paragraphs, clean each one with del_html, and write the
# cleaned column to a new CSV file.
header = "paragraph_txt"
data = pd.read_csv('./clear/writing/writing_paragraph.csv')
new_data = data[header]

# One cleaned string per input row, in the original order.
str_list = [del_html(paragraph) for paragraph in new_data]

output_df = pd.DataFrame({header: str_list})
output_df.to_csv('new_writing_para.csv', index=False)
#new_data = del_html(data['paragraph_txt'])
#print(new_data)
49 changes: 49 additions & 0 deletions spell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pandas as pd
import re
import csv
from hanspell import spell_checker
from tqdm import tqdm
from sklearn.metrics import f1_score
from pykospacing import Spacing

def _space_and_spell(spacing, sentence):
    """Return ``(spaced, checked)`` for one sentence.

    ``spaced`` is the output of the spacing model; ``checked`` is the
    ``.checked`` text that ``spell_checker.check`` returns for ``spaced``.
    """
    spaced = spacing(sentence)
    checked = spell_checker.check(spaced).checked
    return spaced, checked


# For every paragraph: re-space and spell-check each sentence, then score the
# re-spaced tokens against the spell-checked tokens with a micro-averaged F1.
data = pd.read_csv('./new_agree_para.csv')
new_data = data['paragraph_txt']
f1_list = []

# The spacing object is loop-invariant, so build it once instead of once per
# sentence (presumably construction loads a model -- confirm against PyKoSpacing).
spacing = Spacing()

for cnt, paragraph in enumerate(tqdm(new_data), start=1):
    print(cnt)
    spell_list = []
    space_list = []
    for sentence in paragraph.split('#@문장구분#'):
        try:
            spaced, checked = _space_and_spell(spacing, sentence)
        except Exception:
            # Retry once with tab characters removed; a second failure
            # propagates to the caller.
            spaced, checked = _space_and_spell(
                spacing, sentence.replace("\t", "")
            )
        # Append only after both steps succeeded so the two lists always
        # stay aligned (one entry each per sentence).
        space_list.append(spaced)
        spell_list.append(checked)

    cor_list = []
    pred_list = []
    for cor, pred in zip(spell_list, space_list):
        # zip stops at the shorter token list, so surplus tokens on either
        # side are not scored.
        for cor_tok, pred_tok in zip(cor.split(' '), pred.split(' ')):
            cor_list.append(cor_tok)
            pred_list.append(pred_tok)

    f1_list.append(f1_score(cor_list, pred_list, average='micro'))

output_df = pd.DataFrame({'f1': f1_list})
output_df.to_csv('spell_s_agree_para.csv', index=False)
Loading

0 comments on commit d858144

Please sign in to comment.