-
Notifications
You must be signed in to change notification settings - Fork 0
/
Train.py
25 lines (22 loc) · 649 Bytes
/
Train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
# Configure seaborn's global plot styling once, at import time. Nothing in the
# visible script draws a plot, so this only matters if plotting code is added.
sns.set(color_codes=True)
from bs4 import BeautifulSoup
import requests
import spacy
import re
import numpy as np
def url_to_string(content):
    """Strip markup from ``content`` and return its text on a single line.

    Despite the name, no URL is fetched here: ``content`` is handed directly
    to BeautifulSoup using the ``lxml`` parser. Every ``<script>``,
    ``<style>`` and ``<aside>`` element is removed from the parsed tree, and
    each run of newline/tab characters in the remaining text is replaced by
    one space.
    """
    parsed = BeautifulSoup(content, 'lxml')
    # Remove elements whose contents are not part of the readable text.
    for unwanted in parsed.find_all(["script", "style", 'aside']):
        unwanted.extract()
    raw_text = parsed.get_text()
    return re.sub(r'[\n\t]+', " ", raw_text)
# Load the tab-separated training file into a DataFrame.
text = pd.read_csv("train.txt", delimiter="\t")

# BeautifulSoup parses strings (or file-like objects), not DataFrames, so
# serialise the frame back to tab-separated text before cleaning it.
# index=False keeps pandas' row numbers out of the text that gets analysed.
ny = url_to_string(text.to_csv(sep="\t", index=False))

# The 'en' shortcut link was removed in spaCy v3; load the English model by
# its package name instead (spaCy v2 also accepts this when that package is
# installed).
nlp = spacy.load('en_core_web_sm')
article = nlp(ny)

# One (entity span, numeric label id, label name) triple per named entity.
newlabels = [(ent, ent.label, ent.label_) for ent in article.ents]
print(newlabels)