-
Notifications
You must be signed in to change notification settings - Fork 4
/
preprocess.py
156 lines (103 loc) · 3.25 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import codecs
from rdflib import Graph
def open_triples(path):
    """Open the file at ``path`` for reading, decoded as UTF-8.

    Returns the open stream; the caller is responsible for closing it.
    """
    handle = codecs.open(path, mode="r", encoding="utf-8")
    return handle
def read_triples(path):
    """Return the entire content of the UTF-8 file at ``path`` as one string.

    The file is closed before returning, also when decoding fails.
    """
    with codecs.open(path, "r", "utf-8") as handle:
        return handle.read()
# takes the path of the .ttl file and returns the dictionary
# whose keys are the subjects and whose values are the objects.
def read_triples_manuel(path):
    """Build a subject -> object mapping from a whitespace-separated triple file.

    Only lines that start with ``<`` are considered. Each such line is split
    on whitespace; the first token is the subject and the third token is the
    object, and the first and last character of each (the surrounding
    ``<``/``>``) are stripped. When a subject occurs more than once, the last
    occurrence wins.

    Lines starting with ``<`` that have fewer than three tokens are skipped
    instead of raising ``IndexError``.

    Args:
        path: Path of the UTF-8 encoded triple file.

    Returns:
        A ``dict`` mapping subject strings to object strings.
    """
    result = dict()
    with codecs.open(path, "r", "utf-8") as triples_file:
        for line in triples_file:
            if not line.startswith('<'):
                continue
            tokens = line.split()
            if len(tokens) < 3:
                # Malformed triple line: no object token to read.
                continue
            subject = tokens[0][1:-1]
            obj = tokens[2][1:-1]
            result[subject] = obj
    return result
def parse_triples(input_triple, input_format='n3'):
    """Parse serialized RDF text with a fresh rdflib ``Graph``.

    Args:
        input_triple: The serialized data, passed to ``Graph.parse`` as ``data``.
        input_format: Serialization format name passed as ``format``
            (``'n3'`` by default).

    Returns:
        Whatever ``Graph.parse`` returns for the newly created graph.
    """
    return Graph().parse(data=input_triple, format=input_format)
# takes the rdflib graph and writes its subject and object
# to the given file.
def write_triple(input_triple, path):
    """Write one ``"<subject> <object>"`` line per triple to ``path``.

    The predicate of each triple is ignored. Progress is printed before
    every 100000th triple and the total is printed at the end. The output
    file is closed even if iterating or writing raises.

    Args:
        input_triple: A sized iterable of ``(subject, predicate, object)``
            triples (presumably an rdflib graph -- it must support ``len()``).
        path: Destination file path; an existing file is overwritten.
    """
    with open(path, 'w') as out:
        count = 0
        print('writing is started...')
        print(len(input_triple))
        for subj, _pred, obj in input_triple:
            out.write(str(subj) + ' ' + str(obj) + '\n')
            if count % 100000 == 0:
                print(count, 'nodes are written..')
            count += 1
        print('count', count)
def triple2dict(input_triple):
    """Map ``str(subject)`` to ``str(object)`` for every triple.

    The predicate is ignored; for repeated subjects the last triple wins.
    """
    return {str(s): str(o) for s, _p, o in input_triple}
# nodes_ids is a dictionary: keys are urls, values are their ids.
# edges is a list of tuples, each naming two nodes joined by an edge.
def filter_edges_by_nodes(nodes_ids, edges):
    """Keep only the edges whose two endpoints both appear in ``nodes_ids``.

    Returns:
        A pair ``(kept_edges, kept_ids)``: the surviving edges as
        ``(node1, node2)`` tuples and, in the same order, the matching
        ``(id1, id2)`` tuples looked up in ``nodes_ids``.
    """
    kept_edges, kept_ids = [], []
    for edge in edges:
        source, target = edge[0], edge[1]
        try:
            id_pair = (nodes_ids[source], nodes_ids[target])
        except KeyError:
            # At least one endpoint is unknown: drop the edge.
            continue
        kept_edges.append((source, target))
        kept_ids.append(id_pair)
    return kept_edges, kept_ids
def read_dict(path):
    """Read a whitespace-separated file into a ``dict`` of token lists.

    The first token of each line becomes the key and the remaining tokens
    (possibly none) become the value list. Blank lines are skipped. For
    repeated keys the last line wins. The file is closed before returning.

    Args:
        path: Path of the file to read.

    Returns:
        A ``dict`` mapping ``str`` keys to lists of ``str`` tokens.
    """
    result = dict()
    with codecs.open(path, 'r') as handle:
        for line in handle:
            tokens = line.split()
            if not tokens:
                # Blank line: there is no key to store.
                continue
            result[str(tokens[0])] = tokens[1:]
    return result
def read_lookup(path):
    """Read a ``key integer`` file into a ``dict``.

    Each line is split on whitespace; the first token is the key and the
    second token is converted with ``int``. Lines with fewer than two tokens
    are skipped. The file is closed before returning.

    Args:
        path: Path of the file to read.

    Returns:
        A ``dict`` mapping ``str`` keys to ``int`` values.

    Raises:
        ValueError: If the second token of a line is not a valid integer.
    """
    result = dict()
    with codecs.open(path, 'r') as handle:
        for line in handle:
            tokens = line.split()
            if len(tokens) < 2:
                # Blank or key-only line: nothing to convert.
                continue
            result[str(tokens[0])] = int(tokens[1])
    return result
def read_edges(path):
    """Read an edge list file into a list of ``(int, int)`` tuples.

    Each line is split on whitespace and its first two tokens are converted
    with ``int``. Lines with fewer than two tokens (for example blank lines)
    are skipped instead of raising ``IndexError``. The file is closed before
    returning.

    Args:
        path: Path of the edge list file.

    Returns:
        A list of ``(int, int)`` tuples in file order.

    Raises:
        ValueError: If one of the first two tokens is not a valid integer.
    """
    edges = list()
    with codecs.open(path, 'r') as handle:
        for line in handle:
            tokens = line.split()
            if len(tokens) < 2:
                continue
            edges.append((int(tokens[0]), int(tokens[1])))
    return edges
def write_edges(edgelist, path):
    """Write each edge as a ``"<first> <second>"`` line to ``path``.

    Only the first two items of every edge are written. The file is closed
    even if formatting or writing an edge raises.

    Args:
        edgelist: Iterable of indexable edges with at least two items.
        path: Destination file path; an existing file is overwritten.
    """
    with codecs.open(path, 'w') as handle:
        handle.writelines(
            str(edge[0]) + ' ' + str(edge[1]) + '\n' for edge in edgelist
        )
def read_list(path):
    """Return the first whitespace-separated token of every line in ``path``.

    Blank lines are skipped instead of raising ``IndexError``. The file is
    closed before returning.

    Args:
        path: Path of the file to read.

    Returns:
        A list of ``str`` tokens in file order.
    """
    data = list()
    with codecs.open(path, 'r') as handle:
        for line in handle:
            tokens = line.split()
            if tokens:
                data.append(tokens[0])
    return data