forked from DSKSD/DeepNLP-models-Pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Corpus.pyx
51 lines (41 loc) · 1.32 KB
/
Corpus.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import numpy as np
import scipy.sparse as sp
from libcpp.vector cimport vector
from libcpp.string cimport string
from libcpp.map cimport map
# Text preprocessing C++ extension module
def make_dictionary(vocab):
    """Build forward and inverse index mappings for a vocabulary.

    Args:
        vocab: Iterable of hashable tokens. A token's position in iteration
            order becomes its index (0-based).

    Returns:
        A tuple ``(dictionary, inv_dictionary)`` where ``dictionary`` maps
        token -> index and ``inv_dictionary`` maps index -> token. If a token
        occurs more than once, ``dictionary`` keeps its last index while
        ``inv_dictionary`` still has one entry per position.
    """
    dictionary = {}
    inv_dictionary = {}
    # A single enumerate pass fills both maps, so any iterable works (not
    # only sized, integer-indexable sequences) and the input is read once.
    for index, token in enumerate(vocab):
        dictionary[token] = index
        inv_dictionary[index] = token
    return dictionary, inv_dictionary
def make_window_data(sents, window_size):
    """Placeholder: not implemented yet.

    Both arguments are accepted and ignored; the function always returns
    ``None``.
    """
    return None
def make_coo_matrix(corpus, dictionary):
    """Placeholder: not implemented yet.

    The only work done is reading ``len(dictionary)`` into a C ``int``;
    ``corpus`` is ignored and the function always returns ``None``.
    """
    # NOTE(review): matrix_size is presumably meant to be the side length of
    # the matrix this function will eventually build - confirm when
    # implementing.
    cdef int matrix_size
    matrix_size = len(dictionary)
    return None
def getBatch_FromBucket(batch_size, buckets):
    """Yield full batches from several buckets in round-robin order.

    Buckets are visited in index order (0, 1, 2, ..., then 0 again). On each
    visit the next ``batch_size`` consecutive items of that bucket are
    yielded as one slice. A bucket that can no longer supply a full batch is
    dropped from the rotation, so trailing remainders shorter than
    ``batch_size`` are never yielded. Iteration ends once every bucket has
    been dropped.

    Args:
        batch_size: Number of items per batch; must be a positive integer.
        buckets: Sequence of sliceable sequences (e.g. a list of lists).

    Yields:
        ``buckets[b][start:start + batch_size]`` for successive buckets
        ``b``, each of length exactly ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration step.
    """
    if batch_size <= 0:
        # A non-positive step would never exhaust any bucket.
        raise ValueError("batch_size must be a positive integer")

    # Next unread position in each bucket.
    offsets = [0] * len(buckets)
    # Buckets still able to supply at least one more full batch.
    active = list(range(len(buckets)))

    while active:
        survivors = []
        for bucket_index in active:
            start = offsets[bucket_index]
            stop = start + batch_size
            # Check for exhaustion *before* slicing so that no partial or
            # empty batch is ever emitted, regardless of which bucket the
            # rotation arrives at.
            if stop > len(buckets[bucket_index]):
                continue
            offsets[bucket_index] = stop
            survivors.append(bucket_index)
            yield buckets[bucket_index][start:stop]
        active = survivors