/
keyedvectors.py
2120 lines (1769 loc) · 89.9 KB
/
keyedvectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Gensim Contributors
# Copyright (C) 2018 RaRe Technologies s.r.o.
# Licensed under the GNU LGPL v2.1 - https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html
"""This module implements word vectors, and more generally sets of vectors keyed by lookup tokens/ints,
and various similarity look-ups.
Since trained word vectors are independent from the way they were trained (:class:`~gensim.models.word2vec.Word2Vec`,
:class:`~gensim.models.fasttext.FastText` etc), they can be represented by a standalone structure,
as implemented in this module.
The structure is called "KeyedVectors" and is essentially a mapping between *keys*
and *vectors*. Each vector is identified by its lookup key, most often a short string token, so this is usually
a mapping between {str => 1D numpy array}.
The key is, in the original motivating case, a word (so the mapping maps words to 1D vectors),
but for some models, the key can also correspond to a document, a graph node etc.
(Because some applications may maintain their own integral identifiers, compact and contiguous
starting at zero, this class also supports use of plain ints as keys – in that case using them as literal
pointers to the position of the desired vector in the underlying array, and saving the overhead of
a lookup map entry.)
Why use KeyedVectors instead of a full model?
=============================================
+---------------------------+--------------+------------+-------------------------------------------------------------+
| capability | KeyedVectors | full model | note |
+---------------------------+--------------+------------+-------------------------------------------------------------+
| continue training vectors | ❌ | ✅ | You need the full model to train or update vectors. |
+---------------------------+--------------+------------+-------------------------------------------------------------+
| smaller objects | ✅ | ❌ | KeyedVectors are smaller and need less RAM, because they |
| | | | don't need to store the model state that enables training. |
+---------------------------+--------------+------------+-------------------------------------------------------------+
| save/load from native | | | Vectors exported by the Facebook and Google tools |
| fasttext/word2vec format | ✅ | ❌ | do not support further training, but you can still load |
| | | | them into KeyedVectors. |
+---------------------------+--------------+------------+-------------------------------------------------------------+
| append new vectors | ✅ | ✅ | Add new-vector entries to the mapping dynamically. |
+---------------------------+--------------+------------+-------------------------------------------------------------+
| concurrency | ✅ | ✅ | Thread-safe, allows concurrent vector queries. |
+---------------------------+--------------+------------+-------------------------------------------------------------+
| shared RAM | ✅ | ✅ | Multiple processes can re-use the same data, keeping only |
| | | | a single copy in RAM using |
| | | | `mmap <https://en.wikipedia.org/wiki/Mmap>`_. |
+---------------------------+--------------+------------+-------------------------------------------------------------+
| fast load | ✅ | ✅ | Supports `mmap <https://en.wikipedia.org/wiki/Mmap>`_ |
| | | | to load data from disk instantaneously. |
+---------------------------+--------------+------------+-------------------------------------------------------------+
TL;DR: the main difference is that KeyedVectors do not support further training.
On the other hand, by shedding the internal data structures necessary for training, KeyedVectors offer a smaller RAM
footprint and a simpler interface.
How to obtain word vectors?
===========================
Train a full model, then access its `model.wv` property, which holds the standalone keyed vectors.
For example, using the Word2Vec algorithm to train the vectors
.. sourcecode:: pycon
>>> from gensim.test.utils import lee_corpus_list
>>> from gensim.models import Word2Vec
>>>
>>> model = Word2Vec(lee_corpus_list, vector_size=24, epochs=100)
>>> word_vectors = model.wv
Persist the word vectors to disk with
.. sourcecode:: pycon
>>> from gensim.models import KeyedVectors
>>>
>>> word_vectors.save('vectors.kv')
>>> reloaded_word_vectors = KeyedVectors.load('vectors.kv')
The vectors can also be instantiated from an existing file on disk
in the original Google's word2vec C format as a KeyedVectors instance
.. sourcecode:: pycon
>>> from gensim.test.utils import datapath
>>>
>>> wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False) # C text format
>>> wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True) # C bin format
What can I do with word vectors?
================================
You can perform various syntactic/semantic NLP word tasks with the trained vectors.
Some of them are already built-in
.. sourcecode:: pycon
>>> import gensim.downloader as api
>>>
>>> word_vectors = api.load("glove-wiki-gigaword-100") # load pre-trained word-vectors from gensim-data
>>>
>>> # Check the "most similar words", using the default "cosine similarity" measure.
>>> result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
>>> most_similar_key, similarity = result[0] # look at the first match
>>> print(f"{most_similar_key}: {similarity:.4f}")
queen: 0.7699
>>>
>>> # Use a different similarity measure: "cosmul".
>>> result = word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
>>> most_similar_key, similarity = result[0] # look at the first match
>>> print(f"{most_similar_key}: {similarity:.4f}")
queen: 0.8965
>>>
>>> print(word_vectors.doesnt_match("breakfast cereal dinner lunch".split()))
cereal
>>>
>>> similarity = word_vectors.similarity('woman', 'man')
>>> similarity > 0.8
True
>>>
>>> result = word_vectors.similar_by_word("cat")
>>> most_similar_key, similarity = result[0] # look at the first match
>>> print(f"{most_similar_key}: {similarity:.4f}")
dog: 0.8798
>>>
>>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
>>> sentence_president = 'The president greets the press in Chicago'.lower().split()
>>>
>>> similarity = word_vectors.wmdistance(sentence_obama, sentence_president)
>>> print(f"{similarity:.4f}")
3.4893
>>>
>>> distance = word_vectors.distance("media", "media")
>>> print(f"{distance:.1f}")
0.0
>>>
>>> similarity = word_vectors.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
>>> print(f"{similarity:.4f}")
0.7067
>>>
>>> vector = word_vectors['computer'] # numpy vector of a word
>>> vector.shape
(100,)
>>>
>>> vector = word_vectors.get_vector('office', norm=True)
>>> vector.shape
(100,)
Correlation with human opinion on word similarity
.. sourcecode:: pycon
>>> from gensim.test.utils import datapath
>>>
>>> similarities = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
And on word analogies
.. sourcecode:: pycon
>>> analogy_scores = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
and so on.
"""
import logging
import sys
import itertools
import warnings
from numbers import Integral
from typing import Iterable
from numpy import (
dot, float32 as REAL, double, zeros, vstack, ndarray,
sum as np_sum, prod, argmax, dtype, ascontiguousarray, frombuffer,
)
import numpy as np
from scipy import stats
from scipy.spatial.distance import cdist
from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
from gensim.corpora.dictionary import Dictionary
from gensim.utils import deprecated
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Types treated as ONE lookup key (a string token or an integer id) rather than as a collection of keys.
_KEY_TYPES = (str, int, np.integer)
# `_KEY_TYPES` plus raw numpy arrays. NOTE(review): presumably for APIs that also accept a literal
# vector in place of a key -- its users are outside this part of the file, confirm before relying on it.
_EXTENDED_KEY_TYPES = (str, int, np.integer, np.ndarray)
def _ensure_list(value):
    """Normalize an argument that may be a single key/vector into a list-like of keys/vectors.

    * ``None`` becomes an empty list.
    * A single key (see `_KEY_TYPES`) or a single 1D numpy vector is wrapped in a one-element list.
    * A 2D numpy array is split into a list of its rows.
    * Anything else is returned unchanged (it is assumed to already be a collection).
    """
    if value is None:
        return []
    if isinstance(value, ndarray):
        rank = len(value.shape)
        if rank == 1:
            return [value]  # one vector
        if rank == 2:
            return list(value)  # one row per vector
        return value
    if isinstance(value, _KEY_TYPES):
        return [value]
    return value
class KeyedVectors(utils.SaveLoad):
def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None):
    """Mapping between keys (such as words) and vectors for :class:`~gensim.models.Word2Vec`
    and related models.

    Used to perform operations on the vectors such as vector lookup, distance, similarity etc.

    To support the needs of specific models and other downstream uses, you can also set
    additional per-key attributes via the :meth:`~gensim.models.keyedvectors.KeyedVectors.set_vecattr`
    and :meth:`~gensim.models.keyedvectors.KeyedVectors.get_vecattr` methods.
    All values stored under the same `attr` name must have compatible `numpy` types, because the
    type and storage array of an attribute are fixed the first time that `attr` is set.

    Parameters
    ----------
    vector_size : int
        Intended number of dimensions for all contained vectors.
    count : int, optional
        If provided, vectors will be pre-allocated for at least this many vectors. (Otherwise
        they can be added later.)
    dtype : type, optional
        Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless
        another type is provided here.
    mapfile_path : string, optional
        Currently unused.

    """
    self.vector_size = vector_size

    # Pre-allocating `index_to_key` to full size helps avoid redundant re-allocations, esp for `expandos`.
    self.index_to_key = [None] * count  # fka index2entity or index2word
    self.next_index = 0  # pointer to where the next new entry will land
    self.key_to_index = {}

    initial_shape = (count, vector_size)
    self.vectors = zeros(initial_shape, dtype=dtype)  # formerly known as syn0
    self.norms = None  # computed lazily, see `fill_norms()`

    # "expandos" are extra attributes stored for each key: {attribute_name} => numpy array of values of
    # this attribute, with one array value for each vector key.
    # The same information used to be stored in a structure called Vocab in Gensim <4.0.0, but
    # with different indexing: {vector key} => Vocab object containing all attributes for the given vector key.
    #
    # Don't modify expandos directly; call set_vecattr()/get_vecattr() instead.
    self.expandos = {}

    self.mapfile_path = mapfile_path
def __str__(self):
    """Return a short human-readable summary: class name, vector size and number of keys."""
    class_name = type(self).__name__
    return f"{class_name}<vector_size={self.vector_size}, {len(self)} keys>"
def _load_specials(self, *args, **kwargs):
    """Handle special requirements of the `.load()` protocol, usually up-converting older versions.

    After delegating to the parent implementation, attributes that were renamed or introduced in
    later versions are patched in, in this order: old doc-tag data, `index_to_key`, `vectors`
    (and `vector_size`), `norms`, `expandos`, `key_to_index`, `next_index`.
    """
    super(KeyedVectors, self)._load_specials(*args, **kwargs)

    if hasattr(self, 'doctags'):
        self._upconvert_old_d2vkv()

    # `index2word` / `index2entity` were consolidated into `index_to_key`; both legacy entries are
    # removed from the instance dict, with `index2word` taking precedence when both exist.
    if not hasattr(self, 'index_to_key'):
        legacy_entities = self.__dict__.pop('index2entity', None)
        self.index_to_key = self.__dict__.pop('index2word', legacy_entities)

    # `syn0` was renamed to `vectors`
    if not hasattr(self, 'vectors'):
        self.vectors = self.__dict__.pop('syn0', None)
        self.vector_size = self.vectors.shape[1]

    # a `None` in `norms` forces recalculation on first use
    if not hasattr(self, 'norms'):
        self.norms = None

    # make sure there is at least an empty `expandos`
    if not hasattr(self, 'expandos'):
        self.expandos = {}

    # the old `vocab` dict was replaced by the `key_to_index` map
    if 'key_to_index' not in self.__dict__:
        self._upconvert_old_vocab()

    # older instances were saved without `next_index`
    if not hasattr(self, 'next_index'):
        self.next_index = len(self)
def _upconvert_old_vocab(self):
    """Convert a loaded, pre-gensim-4.0.0 version instance that had a 'vocab' dict of data objects.

    The `vocab` entry is removed from the instance dict; each key's `index` goes into
    `key_to_index`, and every attribute of the old per-key object is re-stored through
    :meth:`set_vecattr`.
    """
    legacy_vocab = self.__dict__.pop('vocab', None)
    self.key_to_index = {}
    for key, entry in legacy_vocab.items():
        self.key_to_index[key] = entry.index
        for attr, attr_value in entry.__dict__.items():
            self.set_vecattr(entry.index, attr, attr_value)

    # special case to enforce required type on `sample_int`
    if 'sample_int' in self.expandos:
        sample_ints = self.expandos['sample_int']
        self.expandos['sample_int'] = sample_ints.astype(np.uint32)
def allocate_vecattrs(self, attrs=None, types=None):
    """Ensure arrays for given per-vector extra-attribute names & types exist, at right size.

    The length of the `index_to_key` list is the canonical 'intended size' of KeyedVectors,
    even if other properties (the vectors array) haven't been allocated or expanded yet,
    so this allocation targets that size.

    Parameters
    ----------
    attrs : iterable of str, optional
        Attribute names to allocate. With no arguments, every existing attribute array is
        resized to match the length of `index_to_key`.
    types : iterable of type, optional
        Requested type for each name in `attrs`, in the same order.

    Raises
    ------
    TypeError
        If a requested type conflicts with the type of an already-allocated attribute.

    """
    if attrs is None:
        # no arguments: only adjust the lengths of the arrays that already exist
        attrs = list(self.expandos.keys())
        types = [self.expandos[name].dtype for name in attrs]

    wanted_size = len(self.index_to_key)
    for name, requested in zip(attrs, types):
        if requested is int:
            # ensure 'int' type 64-bit (numpy-on-Windows https://github.com/numpy/numpy/issues/9464)
            requested = np.int64
        elif requested is str:
            # Avoid typing numpy arrays as strings, because numpy would use its fixed-width
            # `dtype=np.str_` dtype, which uses too much memory!
            requested = object

        if name not in self.expandos:
            self.expandos[name] = np.zeros(wanted_size, dtype=requested)
            continue

        current = self.expandos[name]
        if not np.issubdtype(requested, current.dtype):
            raise TypeError(
                f"Can't allocate type {requested} for attribute {name}, "
                f"conflicts with its existing type {current.dtype}"
            )

        old_size = len(current)
        if old_size == wanted_size:
            continue  # no resizing necessary

        # copy over as many old values as fit; any new tail stays zero-filled
        resized = np.zeros(wanted_size, dtype=current.dtype)
        kept = min(old_size, wanted_size)
        resized[:kept] = current[:kept]
        self.expandos[name] = resized
def set_vecattr(self, key, attr, val):
    """Set attribute associated with the given key to value.

    The backing array for `attr` is allocated (or type-checked) first, using the Python type
    of `val`.

    Parameters
    ----------
    key : str
        Store the attribute for this vector key.
    attr : str
        Name of the additional attribute to store for the given key.
    val : object
        Value of the additional attribute to store for the given key.

    Returns
    -------
    None

    """
    value_type = type(val)
    self.allocate_vecattrs(attrs=[attr], types=[value_type])
    slot = self.get_index(key)
    storage = self.expandos[attr]
    storage[slot] = val
def get_vecattr(self, key, attr):
    """Get attribute value associated with given key.

    Parameters
    ----------
    key : str
        Vector key for which to fetch the attribute value.
    attr : str
        Name of the additional attribute to fetch for the given key.

    Returns
    -------
    object
        Value of the additional attribute fetched for the given key.

    """
    slot = self.get_index(key)
    storage = self.expandos[attr]
    return storage[slot]
def resize_vectors(self, seed=0):
    """Make underlying vectors match index_to_key size; random-initialize any new rows.

    The per-key attribute arrays are resized as well, and the cached norms are dropped so that
    they get recalculated on next use.
    """
    row_count = len(self.index_to_key)
    self.vectors = prep_vectors((row_count, self.vector_size), prior_vectors=self.vectors, seed=seed)
    self.allocate_vecattrs()
    self.norms = None
def __len__(self):
    """Return the number of key slots, i.e. the length of `index_to_key`."""
    key_slots = self.index_to_key
    return len(key_slots)
def __getitem__(self, key_or_keys):
    """Get vector representation of `key_or_keys`.

    Parameters
    ----------
    key_or_keys : {str, list of str, int, list of int}
        Requested key or list-of-keys.

    Returns
    -------
    numpy.ndarray
        Vector representation for `key_or_keys` (1D if `key_or_keys` is single key, otherwise - 2D).

    """
    if not isinstance(key_or_keys, _KEY_TYPES):
        # a collection of keys: stack their vectors row by row
        rows = [self.get_vector(key) for key in key_or_keys]
        return vstack(rows)
    return self.get_vector(key_or_keys)
def get_index(self, key, default=None):
    """Return the integer index (slot/position) where the given key's vector is stored in the
    backing vectors array.

    A key found in `key_to_index` wins; otherwise an integer key that is a valid position in
    `index_to_key` is returned as-is. If neither applies, `default` is returned when it is not
    None, else a KeyError is raised.
    """
    mapped = self.key_to_index.get(key, -1)
    if mapped >= 0:
        return mapped
    if isinstance(key, (int, np.integer)) and 0 <= key < len(self.index_to_key):
        return key  # plain ints are literal positions
    if default is not None:
        return default
    raise KeyError(f"Key '{key}' not present")
def get_vector(self, key, norm=False):
    """Get the key's vector, as a 1D numpy array.

    Parameters
    ----------
    key : str
        Key for vector to return.
    norm : bool, optional
        If True, the resulting vector will be L2-normalized (unit Euclidean length).

    Returns
    -------
    numpy.ndarray
        Vector for the specified key. The returned array is flagged read-only.

    Raises
    ------
    KeyError
        If the given key doesn't exist.

    """
    row = self.get_index(key)
    if not norm:
        result = self.vectors[row]
    else:
        self.fill_norms()
        result = self.vectors[row] / self.norms[row]

    result.setflags(write=False)  # disallow direct tampering that would invalidate `norms` etc
    return result
@deprecated("Use get_vector instead")
def word_vec(self, *args, **kwargs):
    """Compatibility alias for get_vector(); must exist so subclass calls reach subclass get_vector()."""
    vector = self.get_vector(*args, **kwargs)
    return vector
def get_mean_vector(self, keys, weights=None, pre_normalize=True, post_normalize=False, ignore_missing=True):
    """Get the mean vector for a given list of keys.

    Parameters
    ----------
    keys : list of (str or int or ndarray)
        Keys specified by string or int ids or numpy array.
    weights : list of float or numpy.ndarray, optional
        1D array of same size of `keys` specifying the weight for each key.
    pre_normalize : bool, optional
        Flag indicating whether to normalize each keyvector before taking mean.
        If False, individual keyvector will not be normalized.
        Entries of `keys` that are already numpy arrays are used as given.
    post_normalize: bool, optional
        Flag indicating whether to normalize the final mean vector.
        If True, normalized mean vector will be return.
    ignore_missing : bool, optional
        If False, will raise error if a key doesn't exist in vocabulary.

    Returns
    -------
    numpy.ndarray
        Mean vector for the list of keys.

    Raises
    ------
    ValueError
        If `keys` is empty, or the size of the list of `keys` and `weights` doesn't match.
    KeyError
        If any of the key doesn't exist in vocabulary and `ignore_missing` is false.

    """
    if len(keys) == 0:
        raise ValueError("cannot compute mean with no input")

    if weights is None:
        weights = np.ones(len(keys))
    elif isinstance(weights, list):
        weights = np.array(weights)

    if len(keys) != weights.shape[0]:  # weights is a 1-D numpy array
        raise ValueError(
            "keys and weights array must have same number of elements"
        )

    mean = np.zeros(self.vector_size, self.vectors.dtype)
    total_weight = 0
    for weight, key in zip(weights, keys):
        if isinstance(key, ndarray):
            vec = key  # a literal vector
        elif self.__contains__(key):
            vec = self.get_vector(key, norm=pre_normalize)
        else:
            if not ignore_missing:
                raise KeyError(f"Key '{key}' not present in vocabulary")
            continue  # silently skip unknown keys
        mean += weight * vec
        total_weight += abs(weight)

    # divide by the summed absolute weights; skipped when nothing contributed
    if total_weight > 0:
        mean = mean / total_weight
    if post_normalize:
        mean = matutils.unitvec(mean).astype(REAL)
    return mean
def add_vector(self, key, vector):
    """Add one new vector at the given key, into existing slot if available.

    Warning: using this repeatedly is inefficient, requiring a full reallocation & copy,
    if this instance hasn't been preallocated to be ready for such incremental additions.

    Any cached norms are dropped, because the vectors array is modified.

    Parameters
    ----------
    key: str
        Key identifier of the added vector.
    vector: numpy.ndarray
        1D numpy array with the vector values.

    Returns
    -------
    int
        Index of the newly added vector, so that ``self.vectors[result] == vector`` and
        ``self.index_to_key[result] == key``.

    """
    target_index = self.next_index
    if target_index >= len(self) or self.index_to_key[target_index] is not None:
        # must append at end by expanding existing structures
        target_index = len(self)
        warnings.warn(
            "Adding single vectors to a KeyedVectors which grows by one each time can be costly. "
            "Consider adding in batches or preallocating to the required size.",
            UserWarning)
        self.add_vectors([key], [vector])
        self.allocate_vecattrs()  # grow any adjunct arrays
        self.next_index = target_index + 1
    else:
        # can add to existing (pre-allocated, still empty) slot
        self.index_to_key[target_index] = key
        self.key_to_index[key] = target_index
        self.vectors[target_index] = vector
        self.next_index += 1
    # The vectors changed in both branches, so previously computed norms no longer match them;
    # `None` triggers a full recalculation on the next `fill_norms()`.
    self.norms = None
    return target_index
def add_vectors(self, keys, weights, extras=None, replace=False):
    """Append keys and their vectors in a manual way.

    If some key is already in the vocabulary, the old vector is kept unless `replace` flag is True.
    Any cached norms are dropped, because the vectors array is modified.

    Parameters
    ----------
    keys : list of (str or int)
        Keys specified by string or int ids.
    weights: list of numpy.ndarray or numpy.ndarray
        List of 1D np.array vectors or a 2D np.array of vectors.
    extras : dict of (str, numpy.ndarray), optional
        Extra per-key attributes: attribute name => array with one value per entry of `keys`.
    replace: bool, optional
        Flag indicating whether to replace vectors for keys which already exist in the map;
        if True - replace vectors, otherwise - keep old vectors.

    """
    if isinstance(keys, _KEY_TYPES):
        keys = [keys]
        weights = np.array(weights).reshape(1, -1)
    elif isinstance(weights, list):
        weights = np.array(weights)
    if extras is None:
        extras = {}

    # TODO? warn if not matching extras already present?
    # initially allocate extras, check type compatibility
    self.allocate_vecattrs(extras.keys(), [extras[k].dtype for k in extras.keys()])

    # Membership is decided once, up front, against the vocabulary as it was before this call.
    in_vocab_mask = np.zeros(len(keys), dtype=bool)
    for idx, key in enumerate(keys):
        if key in self.key_to_index:
            in_vocab_mask[idx] = True
    new_mask = ~in_vocab_mask

    # add new entities to the vocab
    for idx in np.nonzero(new_mask)[0]:
        key = keys[idx]
        self.key_to_index[key] = len(self.index_to_key)
        self.index_to_key.append(key)

    # add vectors, extras for new entities
    self.vectors = vstack((self.vectors, weights[new_mask].astype(self.vectors.dtype)))
    # `extras` is a dict, so iterate its items; attribute arrays are 1D (one value per key),
    # so they are extended by concatenation rather than stacked as rows.
    for attr, extra in extras.items():
        self.expandos[attr] = np.concatenate((self.expandos[attr], extra[new_mask]))

    # change vectors, extras for in_vocab entities if `replace` flag is specified
    if replace:
        in_vocab_idxs = [self.get_index(keys[idx]) for idx in np.nonzero(in_vocab_mask)[0]]
        self.vectors[in_vocab_idxs] = weights[in_vocab_mask]
        for attr, extra in extras.items():
            self.expandos[attr][in_vocab_idxs] = extra[in_vocab_mask]

    # The vectors array was rebuilt and possibly overwritten: force norms to be recalculated on demand.
    self.norms = None
def __setitem__(self, keys, weights):
    """Add keys and theirs vectors in a manual way.

    If some key is already in the vocabulary, old vector is replaced with the new one.

    This method is an alias for :meth:`~gensim.models.keyedvectors.KeyedVectors.add_vectors`
    with `replace=True`.

    Parameters
    ----------
    keys : {str, int, list of (str or int)}
        keys specified by their string or int ids. Anything that is not a `list` is treated
        as one single key.
    weights: list of numpy.ndarray or numpy.ndarray
        List of 1D np.array vectors or 2D np.array of vectors. For a single key, any
        array-like holding that one vector is accepted.

    """
    if not isinstance(keys, list):
        keys = [keys]
        # `np.asarray` lets a plain list of numbers work too, matching what `add_vectors`
        # accepts for a single key; an ndarray is passed through without copying.
        weights = np.asarray(weights).reshape(1, -1)
    self.add_vectors(keys, weights, replace=True)
def has_index_for(self, key):
    """Can this model return a single index for this key?

    Subclasses that synthesize vectors for out-of-vocabulary words (like
    :class:`~gensim.models.fasttext.FastText`) may respond True for a
    simple `word in wv` (`__contains__()`) check but False for this
    more-specific check.

    """
    slot = self.get_index(key, -1)  # -1 is the "not found" marker
    return slot >= 0
def __contains__(self, key):
    """Support `key in kv`; delegates to :meth:`has_index_for`."""
    found = self.has_index_for(key)
    return found
def most_similar_to_given(self, key1, keys_list):
    """Get the `key` from `keys_list` most similar to `key1`."""
    scores = [self.similarity(key1, candidate) for candidate in keys_list]
    best_position = argmax(scores)
    return keys_list[best_position]
def closer_than(self, key1, key2):
    """Get all keys that are closer to `key1` than `key2` is to `key1`.

    `key1` itself is never included in the result.
    """
    distances_from_key1 = self.distances(key1)
    own_index = self.get_index(key1)
    reference_index = self.get_index(key2)

    threshold = distances_from_key1[reference_index]
    closer_positions = np.nonzero(distances_from_key1 < threshold)[0]

    closer_keys = []
    for position in closer_positions:
        if position != own_index:
            closer_keys.append(self.index_to_key[position])
    return closer_keys
@deprecated("Use closer_than instead")
def words_closer_than(self, word1, word2):
    """Deprecated alias that forwards to :meth:`closer_than`."""
    closer_keys = self.closer_than(word1, word2)
    return closer_keys
def rank(self, key1, key2):
    """Rank of the distance of `key2` from `key1`, in relation to distances of all keys from `key1`."""
    closer_keys = self.closer_than(key1, key2)
    return 1 + len(closer_keys)
@property
def vectors_norm(self):
    """Removed attribute: reading it always raises AttributeError pointing to `get_normed_vectors()`."""
    message = (
        "The `.vectors_norm` attribute is computed dynamically since Gensim 4.0.0. "
        "Use `.get_normed_vectors()` instead.\n"
        "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4"
    )
    raise AttributeError(message)

@vectors_norm.setter
def vectors_norm(self, _):
    # Assignments are ignored, but the setter must remain for backward serialization compatibility.
    return None
def get_normed_vectors(self):
    """Get all embedding vectors normalized to unit L2 length (euclidean), as a 2D numpy array.

    To see which key corresponds to which vector = which array row, refer
    to the :attr:`~gensim.models.keyedvectors.KeyedVectors.index_to_key` attribute.

    Returns
    -------
    numpy.ndarray:
        2D numpy array of shape ``(number_of_keys, embedding dimensionality)``, L2-normalized
        along the rows (key vectors).

    """
    self.fill_norms()
    # add a trailing axis so each row is divided by its own norm
    norms_as_column = self.norms[..., np.newaxis]
    return self.vectors / norms_as_column
def fill_norms(self, force=False):
    """
    Ensure per-vector norms are available.

    Norms are computed only when none are cached yet, or when `force` is set.

    Any code which modifies vectors should ensure the accompanying norms are
    either recalculated or 'None', to trigger a full recalculation later on-request.

    """
    if self.norms is not None and not force:
        return  # cached norms are reused
    self.norms = np.linalg.norm(self.vectors, axis=1)
@property
def index2entity(self):
    """Removed attribute: reading it always raises AttributeError pointing to `index_to_key`."""
    message = (
        "The index2entity attribute has been replaced by index_to_key since Gensim 4.0.0.\n"
        "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4"
    )
    raise AttributeError(message)

@index2entity.setter
def index2entity(self, value):
    # Writes are redirected to `index_to_key`; must remain for backward serialization compatibility.
    self.index_to_key = value
@property
def index2word(self):
    """Removed attribute: reading it always raises AttributeError pointing to `index_to_key`."""
    message = (
        "The index2word attribute has been replaced by index_to_key since Gensim 4.0.0.\n"
        "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4"
    )
    raise AttributeError(message)

@index2word.setter
def index2word(self, value):
    # Writes are redirected to `index_to_key`; must remain for backward serialization compatibility.
    self.index_to_key = value
@property
def vocab(self):
    """Removed attribute: reading it always raises AttributeError describing the replacements."""
    message = (
        "The vocab attribute was removed from KeyedVector in Gensim 4.0.0.\n"
        "Use KeyedVector's .key_to_index dict, .index_to_key list, and methods "
        ".get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.\n"
        "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4"
    )
    raise AttributeError(message)

@vocab.setter
def vocab(self, value):
    # Assigning is refused the same way as reading: evaluating `self.vocab` runs the getter
    # above, which raises AttributeError before the call is ever made.
    self.vocab()
def sort_by_descending_frequency(self):
    """Sort the vocabulary so the most frequent words have the lowest indexes.

    The order is taken from the `count` per-key attribute. Keys, all per-key attribute arrays,
    the vectors (if any are allocated) and the key => index map are permuted consistently;
    cached norms are dropped because they would otherwise keep the old order.
    """
    if not len(self):
        return  # noop if empty
    count_sorted_indexes = np.argsort(self.expandos['count'])[::-1]
    self.index_to_key = [self.index_to_key[idx] for idx in count_sorted_indexes]
    self.allocate_vecattrs()
    for k in self.expandos:
        # Use numpy's "fancy indexing" to permutate the entire array in one step.
        self.expandos[k] = self.expandos[k][count_sorted_indexes]
    if len(self.vectors):
        logger.warning("sorting after vectors have been allocated is expensive & error-prone")
        self.vectors = self.vectors[count_sorted_indexes]
    self.key_to_index = {word: i for i, word in enumerate(self.index_to_key)}
    # Row positions changed, so any previously computed norms are now misaligned with the
    # vectors; `None` makes `fill_norms()` recalculate them on demand.
    self.norms = None
def save(self, *args, **kwargs):
    """Save KeyedVectors to a file.

    All arguments are forwarded unchanged to the parent class' `save()`.

    Parameters
    ----------
    fname : str
        Path to the output file.

    See Also
    --------
    :meth:`~gensim.models.keyedvectors.KeyedVectors.load`
        Load a previously saved model.

    """
    parent = super(KeyedVectors, self)
    parent.save(*args, **kwargs)
def most_similar(
        self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None,
        restrict_vocab=None, indexer=None,
    ):
    """Find the top-N most similar keys.

    Positive keys contribute positively towards the similarity, negative keys negatively.

    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given keys and the vectors for each key in the model.
    The method corresponds to the `word-analogy` and `distance` scripts in the original
    word2vec implementation.

    Parameters
    ----------
    positive : list of (str or int or ndarray) or list of ((str,float) or (int,float) or (ndarray,float)), optional
        List of keys that contribute positively. If tuple, second element specifies the weight (default `1.0`)
    negative : list of (str or int or ndarray) or list of ((str,float) or (int,float) or (ndarray,float)), optional
        List of keys that contribute negatively. If tuple, second element specifies the weight (default `-1.0`)
    topn : int or None, optional
        Number of top-N similar keys to return, when `topn` is int. When `topn` is None,
        then similarities for all keys are returned. Any integral type (including NumPy
        integers) is treated as an int; values below 1 yield an empty list.
    clip_start : int
        Start clipping index.
    clip_end : int
        End clipping index. A falsy value (``None`` or ``0``) means "up to the last vector".
    restrict_vocab : int, optional
        Optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 key vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.) If
        specified, overrides any values of ``clip_start`` or ``clip_end``.
    indexer : object, optional
        Object exposing ``most_similar(vector, topn)``. When given and `topn` is an integer,
        the query is delegated to it and its return value is passed back unchanged:
        neither the clipping range nor the removal of the input keys is applied here.
        Ignored when `topn` is None.

    Returns
    -------
    list of (str, float) or numpy.array
        When `topn` is int, a sequence of (key, similarity) is returned.
        When `topn` is None, then similarities for all keys in the searched range
        (``clip_start:clip_end``, i.e. the whole vocabulary by default) are returned as a
        one-dimensional numpy array.

    """
    if isinstance(topn, Integral) and topn < 1:
        return []

    # allow passing a single string-key or vector for the positive/negative arguments
    positive = _ensure_list(positive)
    negative = _ensure_list(negative)

    self.fill_norms()
    clip_end = clip_end or len(self.vectors)

    if restrict_vocab:
        clip_start = 0
        clip_end = restrict_vocab

    # add weights for each key, if not already present; default to 1.0 for positive and -1.0 for negative keys
    keys = []
    weight = np.concatenate((np.ones(len(positive)), -1.0 * np.ones(len(negative))))
    for idx, item in enumerate(positive + negative):
        if isinstance(item, _EXTENDED_KEY_TYPES):
            keys.append(item)
        else:
            # (key, weight) pair: an explicit weight replaces the default one
            keys.append(item[0])
            weight[idx] = item[1]

    # compute the weighted average of all keys
    mean = self.get_mean_vector(keys, weight, pre_normalize=True, post_normalize=True, ignore_missing=False)
    all_keys = [
        self.get_index(key) for key in keys if isinstance(key, _KEY_TYPES) and self.has_index_for(key)
    ]

    # Same integral test as the guard at the top, so that e.g. a numpy.int64 `topn`
    # does not silently bypass the indexer; the indexer receives a plain Python int.
    if indexer is not None and isinstance(topn, Integral):
        return indexer.most_similar(mean, int(topn))

    dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
    if not topn:
        return dists

    # Over-fetch by the number of input keys, because those are filtered out below.
    best = matutils.argsort(dists, topn=topn + len(all_keys), reverse=True)

    # ignore (don't return) keys from the input; a set gives O(1) membership tests
    input_indexes = set(all_keys)
    result = [
        (self.index_to_key[sim + clip_start], float(dists[sim]))
        for sim in best if (sim + clip_start) not in input_indexes
    ]
    return result[:topn]
def similar_by_word(self, word, topn=10, restrict_vocab=None):
    """Compatibility alias for similar_by_key(): forwards `word` as the key, plus `topn` and `restrict_vocab`."""
    return self.similar_by_key(word, topn=topn, restrict_vocab=restrict_vocab)
def similar_by_key(self, key, topn=10, restrict_vocab=None):
    """Find the top-N most similar keys to a single key.

    Thin wrapper that calls :meth:`most_similar` with `key` as the only positive example.

    Parameters
    ----------
    key : str
        Key to look up neighbours for.
    topn : int or None, optional
        Number of top-N similar keys to return. If `topn` is None, the vector of
        similarity scores is returned instead.
    restrict_vocab : int, optional
        Optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 key vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

    Returns
    -------
    list of (str, float) or numpy.array
        When `topn` is int, a sequence of (key, similarity) is returned.
        When `topn` is None, then similarities for all keys are returned as a
        one-dimensional numpy array with the size of the vocabulary.

    """
    query = [key]
    return self.most_similar(positive=query, topn=topn, restrict_vocab=restrict_vocab)
def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
    """Find the top-N most similar keys to a raw vector.

    Thin wrapper that calls :meth:`most_similar` with `vector` as the only positive example.

    Parameters
    ----------
    vector : numpy.array
        Vector from which similarities are to be computed.
    topn : int or None, optional
        Number of top-N similar keys to return, when `topn` is int. When `topn` is None,
        then similarities for all keys are returned.
    restrict_vocab : int, optional
        Optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 key vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

    Returns
    -------
    list of (str, float) or numpy.array
        When `topn` is int, a sequence of (key, similarity) is returned.
        When `topn` is None, then similarities for all keys are returned as a
        one-dimensional numpy array with the size of the vocabulary.

    """
    query = [vector]
    return self.most_similar(positive=query, topn=topn, restrict_vocab=restrict_vocab)
def wmdistance(self, document1, document2, norm=True):
"""Compute the Word Mover's Distance between two documents.
When using this code, please consider citing the following papers:
* `Rémi Flamary et al. "POT: Python Optimal Transport"
<https://jmlr.org/papers/v22/20-451.html>`_
* `Matt Kusner et al. "From Word Embeddings To Document Distances"
<http://proceedings.mlr.press/v37/kusnerb15.pdf>`_.
Parameters
----------
document1 : list of str
Input document.
document2 : list of str
Input document.
norm : boolean
Normalize all word vectors to unit length before computing the distance?
Defaults to True.
Returns
-------
float
Word Mover's distance between `document1` and `document2`.
Warnings
--------
This method only works if `POT <https://pypi.org/project/POT/>`_ is installed.
If one of the documents have no words that exist in the vocab, `float('inf')` (i.e. infinity)
will be returned.
Raises
------
ImportError
If `POT <https://pypi.org/project/POT/>`_ isn't installed.
"""
# If POT is attempted to be used, but isn't installed, ImportError will be raised in wmdistance
from ot import emd2
# Remove out-of-vocabulary words.
len_pre_oov1 = len(document1)
len_pre_oov2 = len(document2)
document1 = [token for token in document1 if token in self]
document2 = [token for token in document2 if token in self]
diff1 = len_pre_oov1 - len(document1)
diff2 = len_pre_oov2 - len(document2)
if diff1 > 0 or diff2 > 0:
logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', diff1, diff2)
if not document1 or not document2:
logger.warning("At least one of the documents had no words that were in the vocabulary.")
return float('inf')
dictionary = Dictionary(documents=[document1, document2])
vocab_len = len(dictionary)
if vocab_len == 1:
# Both documents are composed of a single unique token => zero distance.
return 0.0
doclist1 = list(set(document1))
doclist2 = list(set(document2))
v1 = np.array([self.get_vector(token, norm=norm) for token in doclist1])
v2 = np.array([self.get_vector(token, norm=norm) for token in doclist2])
doc1_indices = dictionary.doc2idx(doclist1)
doc2_indices = dictionary.doc2idx(doclist2)
# Compute distance matrix.
distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
distance_matrix[np.ix_(doc1_indices, doc2_indices)] = cdist(v1, v2)
if abs(np_sum(distance_matrix)) < 1e-8:
# `emd` gets stuck if the distance matrix contains only zeros.
logger.info('The distance matrix is all zeros. Aborting (returning inf).')
return float('inf')
def nbow(document):
d = zeros(vocab_len, dtype=double)
nbow = dictionary.doc2bow(document) # Word frequencies.
doc_len = len(document)
for idx, freq in nbow:
d[idx] = freq / float(doc_len) # Normalized word frequencies.
return d