import numpy as np
from sklearn.preprocessing import normalize
import requests
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2


class WikiEmbedding:
    """
    In-memory embedding table loaded from a word2vec-style text file.

    The file is expected to start with a ``<rows> <dims>`` header line,
    followed by one ``<token> <v1> ... <vdims>`` line per vector.

    Attributes:
        E: 2-D float array holding one L2-normalised vector per row.
        idx2w: NumPy string array; ``idx2w[i]`` is the token stored in ``E[i]``.
        w2idx: dict mapping a token to its row index in ``E``.
        n_skipped: number of input lines that were dropped while loading
            (undecodable, wrong field count, or non-numeric values).
    """

    def __init__(self, fname):
        """
        Load and L2-normalise all vectors found in ``fname``.

        Args:
            fname: path of the embedding text file.

        Raises:
            ValueError: if the first line is not a ``<rows> <dims>`` header.
        """
        self.w2idx = {}
        self.n_skipped = 0
        words = []

        with open(fname, 'rb') as f:
            header = f.readline().split()
            try:
                n_rows, n_dims = int(header[0]), int(header[1])
            except (IndexError, ValueError):
                raise ValueError(
                    'expected a "<rows> <dims>" header line in %r' % (fname,)
                ) from None
            vectors = np.zeros((n_rows, n_dims))

            for raw in f:
                parsed = self._parse_line(raw, n_dims)
                if parsed is None:
                    self.n_skipped += 1
                    continue
                token, values = parsed

                # Index by the number of rows actually stored (not by the
                # line number) so that skipped lines leave no gaps and
                # E, idx2w and w2idx stay aligned.
                row = len(words)
                if row == vectors.shape[0]:
                    # The header under-reported the row count: grow the buffer.
                    extra = np.zeros((max(row, 1), n_dims))
                    vectors = np.concatenate([vectors, extra])
                vectors[row] = values
                self.w2idx[token] = row
                words.append(token)

        # Drop the unused tail left by skipped lines or an over-sized header.
        vectors = vectors[:len(words)]

        # In-place L2 row normalisation; all-zero rows are left untouched.
        norms = np.sqrt(np.einsum('ij,ij->i', vectors, vectors))
        norms[norms == 0] = 1.0
        vectors /= norms[:, np.newaxis]

        self.E = vectors
        self.idx2w = np.array(words, dtype=str)

    @staticmethod
    def _parse_line(raw, n_dims):
        """Split one raw line into ``(token, vector)``; return None if malformed."""
        try:
            text = raw.decode('utf8')
        except UnicodeDecodeError:
            return None
        # Split from the right so that a token which is empty, contains
        # spaces, or is itself whitespace is kept intact instead of shifting
        # the numeric fields. Only trailing whitespace is stripped.
        fields = text.rstrip().rsplit(' ', n_dims)
        if len(fields) != n_dims + 1:
            return None
        try:
            values = np.array(fields[1:], dtype=float)
        except ValueError:
            return None
        return fields[0], values

    def most_similar(self, w, n=10, min_similarity=0.5):
        """
        Find the top-N most similar words to w, based on cosine similarity.
        As a speed optimization, only consider neighbors with a similarity
        above min_similarity.

        Args:
            w: a token present in the vocabulary, or a query vector with the
                same dimensionality as the rows of ``E``.
            n: maximum number of neighbours to return.
            min_similarity: neighbours must score strictly above this value.

        Returns:
            A list of ``(token, score)`` pairs sorted by decreasing score.
            For a token query the token's own row is excluded. For a vector
            query the single best-scoring row is dropped, on the assumption
            that it is the vector's own entry.

        Raises:
            KeyError: if ``w`` is a string that is not in the vocabulary.
        """
        query_idx = None
        if isinstance(w, str):
            query_idx = self.w2idx[w]
            w = self.E[query_idx]

        scores = self.E.dot(w)
        # only consider neighbors above threshold
        candidates = np.flatnonzero(scores > min_similarity)
        ranked = candidates[np.argsort(-scores[candidates])]

        if query_idx is None:
            ranked = ranked[1:(n + 1)]
        else:
            # Exclude the query row explicitly rather than by position, so
            # ties or a query below the threshold cannot drop a real neighbour.
            ranked = ranked[ranked != query_idx][:n]

        return list(zip(list(self.idx2w[ranked]), list(scores[ranked])))
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
# Build the embedding index from the on-disk vector file.
en_embedding = WikiEmbedding('2017-01-01_2017-01-30_en_100')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-b0aa5fcb03e7> in <module>()
----> 1 en_embedding = WikiEmbedding('2017-01-01_2017-01-30_en_100')

<ipython-input-5-c87c2313e6c5> in __init__(self, fname)
     29                     continue
     30                 w = l[0]
---> 31                 self.E[i] = np.array(l[1:])
     32                 self.w2idx[w] = i
     33                 self.idx2w.append(w)

ValueError: could not broadcast input array from shape (99) into shape (100)
# Nearest neighbours of 'Word2vec' with the default n=10 and min_similarity=0.5.
en_embedding.most_similar('Word2vec')