# fastext_wordembedding.py
# Export fastText word vectors for every word in a word-map JSON as a pickled NumPy matrix.
from tqdm import tqdm
import numpy as np
import fasttext as ft
import json
import pickle
from pdb import set_trace
# ---------------------------------------------------------------------------
# Build a (len(word_map), emb_dim) matrix of fastText vectors for the words in
# a word-map JSON and pickle it. Words absent from the fastText vocabulary keep
# an all-zero row and are recorded in `missing`.
# ---------------------------------------------------------------------------
emb_path = '/home/skumar/.nlp_wordembeddings/cc.en.300.bin'
word_map_path = '/home/skumar/DataScience/Projects_Section/Projects_Working/Image_Captioning_Pytorch/flicker8k-dataset/Vocab_5_cap_per_img_2_min_word_freq.json'  # word map index dict path
embExport_pkl_path = '/home/skumar/DataScience/Projects_Section/Projects_Working/Image_Captioning_Pytorch/flicker8k-dataset/Fastext_embedd_wordMap.pkl'
emb_dim = 300

# Get the words in the word map (JSON object keyed by word).
with open(word_map_path, 'r', encoding='utf-8') as j:
    word_map = json.load(j)

# One row per word-map entry; rows stay zero for words without a vector.
word_emb = np.zeros((len(word_map), emb_dim))
# Parallel list: 'NA' where a vector was found, otherwise the missing word.
missing = ['NA'] * len(word_map)

# Load the fastText model and snapshot its vocabulary ONCE. Calling
# get_words() inside the loop would rebuild and linearly scan the full
# vocabulary list for every word in the word map.
en_vecs = ft.load_model(str(emb_path))
known_words = set(en_vecs.get_words())

# NOTE(review): rows are filled by iteration position `i`, not by the index
# stored in word_map[k]. This presumably relies on the word map's key order
# matching its index values - confirm against the code that consumes the
# pickle.
# Wrapping the dict itself (rather than the enumerate iterator) lets tqdm
# read its length and show a total.
for i, k in enumerate(tqdm(word_map)):
    if k in known_words:
        word_emb[i] = en_vecs.get_word_vector(k)
    else:
        missing[i] = k

print(missing)

# Pickle protocol 2 is kept so the output format is unchanged.
with open(embExport_pkl_path, 'wb') as f:
    pickle.dump(word_emb, f, 2)