-
Notifications
You must be signed in to change notification settings - Fork 0
/
readFiles.py
33 lines (26 loc) · 795 Bytes
/
readFiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 16 11:58:38 2018
@author: kstoutjesdijk
"""
# Works if there is 1 document
#==============================================================================
# from bs4 import BeautifulSoup
# infile = open("test","r")
# contents = infile.read()
# soup = BeautifulSoup(contents, 'xml')
# titles = soup.find_all('TEXT')
# for title in titles:
# #print(title.get_text())
# pass
#==============================================================================
from bs4 import BeautifulSoup
# Read the whole input file. The context manager closes the handle as soon as
# the read finishes (or fails), instead of leaving the file object open.
with open("test2", "r") as handle:
    infile = handle.read()

# Wrap the raw contents in one synthetic <root> element before parsing as XML.
# NOTE(review): presumably "test2" holds several top-level documents, which
# would not form a single-rooted XML document on their own -- confirm
# against the actual data.
infile = '<root>' + infile + '</root>'
soup = BeautifulSoup(infile, 'xml')

# Print the text content of every DOCID element.
docid = soup.find_all('DOCID')
for doc_id in docid:  # deliberately not named `id`, which shadows the builtin
    print(doc_id.get_text())

# Print the text content of every TEXT element.
texts = soup.find_all('TEXT')
for text in texts:
    print(text.get_text())