Skip to content

Commit 13be66d

Browse files
committed
updated
1 parent 6047e9d commit 13be66d

File tree

1 file changed

+244
-0
lines changed

1 file changed

+244
-0
lines changed
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"<a href=\"https://colab.research.google.com/github/minsuk-heo/python_tutorial/data_science/nlp/blob/master/jupyter_notebooks/word2vec_gensim.ipynb\" target=\"_parent\"><img src=\"https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667\" alt=\"Open In Colab\" data-canonical-src=\"https://colab.research.google.com/assets/colab-badge.svg\"></a>"
8+
]
9+
},
10+
{
11+
"cell_type": "markdown",
12+
"metadata": {},
13+
"source": [
14+
"# pretrained Word2Vec download"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": 16,
20+
"metadata": {},
21+
"outputs": [
22+
{
23+
"name": "stdout",
24+
"output_type": "stream",
25+
"text": [
26+
"--2020-01-20 22:14:56-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n",
27+
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.166.53\n",
28+
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.166.53|:443... connected.\n",
29+
"HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n",
30+
"\n",
31+
" The file is already fully retrieved; nothing to do.\n",
32+
"\n"
33+
]
34+
}
35+
],
36+
"source": [
37+
"!wget -P . -c \"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\""
38+
]
39+
},
40+
{
41+
"cell_type": "code",
42+
"execution_count": 1,
43+
"metadata": {},
44+
"outputs": [],
45+
"source": [
46+
"import gensim"
47+
]
48+
},
49+
{
50+
"cell_type": "code",
51+
"execution_count": 2,
52+
"metadata": {},
53+
"outputs": [],
54+
"source": [
55+
"# load pretrained word2vec\n",
56+
"model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)"
57+
]
58+
},
59+
{
60+
"cell_type": "code",
61+
"execution_count": 3,
62+
"metadata": {},
63+
"outputs": [
64+
{
65+
"data": {
66+
"text/plain": [
67+
"[('coffees', 0.721267819404602),\n",
68+
" ('gourmet_coffee', 0.7057087421417236),\n",
69+
" ('Coffee', 0.6900454759597778),\n",
70+
" ('o_joe', 0.6891065835952759),\n",
71+
" ('Starbucks_coffee', 0.6874972581863403)]"
72+
]
73+
},
74+
"execution_count": 3,
75+
"metadata": {},
76+
"output_type": "execute_result"
77+
}
78+
],
79+
"source": [
80+
"# similar words\n",
81+
"model.most_similar(positive=['friend'], topn=5)"
82+
]
83+
},
84+
{
85+
"cell_type": "code",
86+
"execution_count": 7,
87+
"metadata": {},
88+
"outputs": [
89+
{
90+
"data": {
91+
"text/plain": [
92+
"[('queen', 0.7118192911148071)]"
93+
]
94+
},
95+
"execution_count": 7,
96+
"metadata": {},
97+
"output_type": "execute_result"
98+
}
99+
],
100+
"source": [
101+
"# king + woman - man = queen\n",
102+
"model.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)"
103+
]
104+
},
105+
{
106+
"cell_type": "code",
107+
"execution_count": 5,
108+
"metadata": {},
109+
"outputs": [
110+
{
111+
"data": {
112+
"text/plain": [
113+
"300"
114+
]
115+
},
116+
"execution_count": 5,
117+
"metadata": {},
118+
"output_type": "execute_result"
119+
}
120+
],
121+
"source": [
122+
"# Word2Vec vector dimension\n",
123+
"len(model['friend'])"
124+
]
125+
},
126+
{
127+
"cell_type": "code",
128+
"execution_count": 6,
129+
"metadata": {},
130+
"outputs": [
131+
{
132+
"data": {
133+
"text/plain": [
134+
"array([-1.61132812e-01, -1.36718750e-01, -3.73046875e-01, 6.17187500e-01,\n",
135+
" 1.08398438e-01, 2.72216797e-02, 1.00097656e-01, -1.51367188e-01,\n",
136+
" -1.66015625e-02, 3.80859375e-01, 6.54296875e-02, -1.31835938e-01,\n",
137+
" 2.53906250e-01, 9.08203125e-02, 2.86865234e-02, 2.53906250e-01,\n",
138+
" -2.05078125e-01, 1.64062500e-01, 2.20703125e-01, -1.74804688e-01,\n",
139+
" -2.01171875e-01, 1.30859375e-01, -3.22265625e-02, -2.41210938e-01,\n",
140+
" -3.19824219e-02, 2.48046875e-01, -2.37304688e-01, 2.89062500e-01,\n",
141+
" 1.64794922e-02, 1.29394531e-02, 1.72119141e-02, -3.53515625e-01,\n",
142+
" -1.66992188e-01, -5.90820312e-02, -2.81250000e-01, 9.94873047e-03,\n",
143+
" -1.94091797e-02, -3.22265625e-01, 1.73339844e-02, -5.83496094e-02,\n",
144+
" -2.59765625e-01, 1.42669678e-03, 5.81054688e-02, 1.13769531e-01,\n",
145+
" -8.64257812e-02, 3.54003906e-02, -4.29687500e-01, 2.86865234e-03,\n",
146+
" 6.98852539e-03, 1.80664062e-01, -1.79687500e-01, 2.95410156e-02,\n",
147+
" -1.56250000e-01, -2.08007812e-01, -9.08203125e-02, 4.15039062e-03,\n",
148+
" 1.07421875e-01, 3.12500000e-01, -1.04980469e-01, -3.24218750e-01,\n",
149+
" -1.24023438e-01, -7.05718994e-04, -1.05957031e-01, 2.12890625e-01,\n",
150+
" 1.12304688e-01, -1.58203125e-01, -1.67968750e-01, -9.71679688e-02,\n",
151+
" 1.53320312e-01, -1.11328125e-01, 3.22265625e-01, 2.28515625e-01,\n",
152+
" 3.20312500e-01, -1.72119141e-02, -4.57031250e-01, 3.23486328e-03,\n",
153+
" -1.76757812e-01, -5.00488281e-02, 3.05175781e-02, -2.75390625e-01,\n",
154+
" -1.65039062e-01, -3.56445312e-02, 7.95898438e-02, 1.35742188e-01,\n",
155+
" -8.64257812e-02, -7.32421875e-02, 1.36718750e-01, 2.33398438e-01,\n",
156+
" 7.95898438e-02, 1.32446289e-02, -4.71191406e-02, 1.01074219e-01,\n",
157+
" 2.37304688e-01, -1.81640625e-01, -2.14843750e-01, -1.65039062e-01,\n",
158+
" -1.66015625e-02, -1.51367188e-01, 3.06640625e-01, -2.40234375e-01,\n",
159+
" -2.29492188e-01, -1.29882812e-01, 8.97216797e-03, 1.97265625e-01,\n",
160+
" 7.47070312e-02, -1.64031982e-03, 1.54296875e-01, -6.80541992e-03,\n",
161+
" -1.12304688e-01, -7.61718750e-02, -8.74023438e-02, -1.31835938e-01,\n",
162+
" -2.94921875e-01, -2.46093750e-01, 6.15234375e-02, -1.23046875e-01,\n",
163+
" -8.34960938e-02, -8.39843750e-02, -1.61132812e-02, -4.30297852e-03,\n",
164+
" -4.05273438e-02, -2.84423828e-02, 1.36718750e-01, 2.13623047e-02,\n",
165+
" -2.81250000e-01, 2.40234375e-01, -3.75976562e-02, -9.66796875e-02,\n",
166+
" 1.28906250e-01, 1.43554688e-01, -1.37695312e-01, -1.38549805e-02,\n",
167+
" -4.12597656e-02, -4.51660156e-02, -3.75976562e-02, 1.89453125e-01,\n",
168+
" 5.32226562e-02, 1.17675781e-01, -8.25195312e-02, -1.56250000e-01,\n",
169+
" 1.47460938e-01, -2.63671875e-01, -2.79296875e-01, -4.31640625e-01,\n",
170+
" -5.90820312e-02, 2.74658203e-03, 2.87109375e-01, -2.71606445e-03,\n",
171+
" -2.46093750e-01, 2.74658203e-02, -9.08203125e-02, 6.54296875e-02,\n",
172+
" -1.94335938e-01, -2.16064453e-02, 2.77343750e-01, 5.98144531e-02,\n",
173+
" 2.33154297e-02, -1.37695312e-01, -5.39062500e-01, -1.64794922e-02,\n",
174+
" -1.25976562e-01, -1.36718750e-01, 3.02734375e-02, 2.50000000e-01,\n",
175+
" 5.53131104e-04, 1.36718750e-01, 2.96875000e-01, -5.10253906e-02,\n",
176+
" 9.08203125e-02, -2.39257812e-01, 1.35742188e-01, 1.11328125e-01,\n",
177+
" 1.96289062e-01, -1.54296875e-01, -3.37890625e-01, -3.36914062e-02,\n",
178+
" -9.47265625e-02, -1.69921875e-01, -1.04003906e-01, 1.46484375e-01,\n",
179+
" 4.54101562e-02, -4.12109375e-01, -2.47070312e-01, -6.10351562e-03,\n",
180+
" 4.55078125e-01, -2.35595703e-02, 4.93164062e-02, 1.42578125e-01,\n",
181+
" 2.66113281e-02, 4.11987305e-03, -7.27539062e-02, 2.53906250e-02,\n",
182+
" -3.39355469e-02, 7.91015625e-02, 2.87109375e-01, 3.88671875e-01,\n",
183+
" -1.58691406e-02, -8.44726562e-02, -1.15722656e-01, -1.22558594e-01,\n",
184+
" -1.02050781e-01, 1.32812500e-01, 2.21679688e-01, -2.03125000e-01,\n",
185+
" 7.91015625e-02, 1.69677734e-02, 2.16796875e-01, 2.33398438e-01,\n",
186+
" -2.08984375e-01, -1.36718750e-01, -2.45117188e-01, 3.93066406e-02,\n",
187+
" -1.80664062e-01, 1.37695312e-01, 1.50390625e-01, -3.90625000e-02,\n",
188+
" -1.32812500e-01, 2.75878906e-02, -1.78710938e-01, 1.55273438e-01,\n",
189+
" 1.36718750e-01, -1.14257812e-01, -2.79296875e-01, -7.86132812e-02,\n",
190+
" 3.08593750e-01, -5.32226562e-02, -1.65039062e-01, 5.83496094e-02,\n",
191+
" 2.19726562e-01, -1.25000000e-01, 6.10351562e-02, -3.39355469e-02,\n",
192+
" -3.16406250e-01, 2.14843750e-01, -4.12597656e-02, -1.94335938e-01,\n",
193+
" 7.76367188e-02, -5.21850586e-03, 6.93359375e-02, 2.18750000e-01,\n",
194+
" 1.71875000e-01, -1.97265625e-01, 1.07910156e-01, 8.25195312e-02,\n",
195+
" 3.39355469e-02, -1.15722656e-01, -2.02941895e-03, 4.83398438e-02,\n",
196+
" 1.50390625e-01, -2.73437500e-01, -9.61914062e-02, 3.39843750e-01,\n",
197+
" 2.98828125e-01, 1.32812500e-01, -3.68652344e-02, -3.08593750e-01,\n",
198+
" 2.94189453e-02, -1.31835938e-01, -7.12890625e-02, -2.57873535e-03,\n",
199+
" -1.17187500e-01, 6.34765625e-03, -1.66992188e-01, 2.01171875e-01,\n",
200+
" -1.33789062e-01, -1.77734375e-01, -1.09863281e-01, 5.06591797e-03,\n",
201+
" -1.07910156e-01, -1.30859375e-01, -5.17578125e-02, 2.57812500e-01,\n",
202+
" 5.41992188e-02, -6.34765625e-03, 3.00598145e-03, 7.95898438e-02,\n",
203+
" -2.37304688e-01, -8.05664062e-02, 6.07910156e-02, 9.27734375e-02,\n",
204+
" 1.65039062e-01, -1.22558594e-01, 1.88476562e-01, 2.50000000e-01,\n",
205+
" -1.42578125e-01, -7.91015625e-02, -1.78710938e-01, 1.52343750e-01,\n",
206+
" -7.76367188e-02, 2.42187500e-01, 2.56347656e-02, -1.26953125e-01,\n",
207+
" -1.25000000e-01, -3.19824219e-02, -1.27929688e-01, 1.49414062e-01,\n",
208+
" -1.34277344e-02, 6.59179688e-02, 2.17773438e-01, 2.02148438e-01],\n",
209+
" dtype=float32)"
210+
]
211+
},
212+
"execution_count": 6,
213+
"metadata": {},
214+
"output_type": "execute_result"
215+
}
216+
],
217+
"source": [
218+
"# print word2vec\n",
219+
"model['friend']"
220+
]
221+
}
222+
],
223+
"metadata": {
224+
"kernelspec": {
225+
"display_name": "Python 3",
226+
"language": "python",
227+
"name": "python3"
228+
},
229+
"language_info": {
230+
"codemirror_mode": {
231+
"name": "ipython",
232+
"version": 3
233+
},
234+
"file_extension": ".py",
235+
"mimetype": "text/x-python",
236+
"name": "python",
237+
"nbconvert_exporter": "python",
238+
"pygments_lexer": "ipython3",
239+
"version": "3.7.5"
240+
}
241+
},
242+
"nbformat": 4,
243+
"nbformat_minor": 2
244+
}

0 commit comments

Comments
 (0)