# Source code for Data_Production.compare_vectors

__author__ = 'matt'

import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import scale
from itertools import combinations
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import argparse
import os


class comparison:
    """
    Compares the vectors of a single word across the data from several
    different corpora. Note that the data from the different corpora must be
    normalized, preferably using sklearn.preprocessing.scale.

    :param base: the directory containing the sub-directories that contain
        the data for the different corpora
    :type base: str
    :param english: the english transcription of the word being analyzed
        (used only in file naming)
    :type english: str
    :param greek: the word in the alphabet of the target language. It must be
        written exactly as it is present in the corpora!
    :type greek: str
    :param measure: the type of data to use in the comparison, cosine
        similarity (CS), log-likelihood (LL), positive pointwise mutual
        information (PPMI), or raw co-occurrence counts (cooc)
    :type measure: str
    :param norm: whether the data needs to be normalized
    :type norm: bool
    :param zscore: threshold a word's value must exceed in a corpus vector
        for that word to be considered when ``sim_calc`` writes its per-pair
        word lists
    :type zscore: float
    :param kwargs: ignored; accepted so that a whole ``argparse`` namespace
        can be passed as keyword arguments
    :raises ValueError: if ``measure`` is not one of the supported values

    :ivar corpora: the parameter information for each corpus to be used.
        Each corpus is represented by a tuple that contains the following
        information:
        **Corpus Name** *(str)*: should match the name of the parent folder
        in which the text files for the corpus are kept,
        **Best window size** *(str)*: the size of the context window as
        determined by ParamTester,
        **Minimum occurrences** *(int)*: the minimum number of times a word
        had to occur in your corpus before being used to produce data in
        SemPipeline,
        **Weighted or Unweighted window type** *(bool)*: whether the data for
        that corpus was produced using a weighted (True) or unweighted
        (False) context window type
    :type corpora: [(str, str, int, bool)]
    :ivar base: passed on from the ``base`` parameter
    :type base: str
    :ivar ekk_rows: an empty dictionary that will contain the vectors for
        each corpus
    :type ekk_rows: dict
    :ivar english: passed on from the ``english`` parameter
    :type english: str
    :ivar greek: passed on from the ``greek`` parameter
    :ivar prefix: part of the naming convention for the files from which the
        vectors will be extracted. Determined by the ``measure`` parameter
    :type prefix: str
    :ivar svd: part of the naming convention for the files from which the
        vectors will be extracted. Currently always the empty string
    :type svd: str
    :ivar norm: passed on from the ``norm`` parameter
    :type norm: bool
    :ivar zscore: passed on from the ``zscore`` parameter
    :type zscore: float
    """

    # Maps the ``measure`` argument to the file-name prefix of the data files.
    _PREFIXES = {
        'CS': 'LL_cosine',
        'LL': 'LL',
        'cooc': 'COOC',
        'PPMI': 'PPMI',
    }

    def __init__(self, base, english, greek, measure, norm=False, zscore=1.0, **kwargs):
        if measure not in self._PREFIXES:
            # Fail immediately: without a prefix no data file can be located,
            # and the object would only break later with an unrelated error.
            raise ValueError('"measure" must be "CS", "LL", "PPMI", or "cooc"')
        self.corpora = [('NT', '16', 1, True),
                        ('LXX', '13', 1, True),
                        ('philo', '26', 1, False),
                        ('josephus', '35', 1, False),
                        ('plutarch', '49', 1, False),
                        ('pers_data', '51', 1, False)]
        self.base = base
        self.ekk_rows = {}
        self.english = english
        self.greek = greek
        self.prefix = self._PREFIXES[measure]
        self.svd = ''
        self.norm = norm
        self.zscore = zscore

    def _corpus_dir(self, corp):
        """Return the directory ``<base>/<corpus>/<english>/<window>``."""
        return os.path.join(self.base, corp[0], self.english, corp[1])

    def _index_path(self, corp):
        """Return the path of the pickled word index list for ``corp``."""
        filename = '{0}_IndexList_w={1}_lems=False_min_occs={2}_no_stops=False.pickle'.format(
            self.english, corp[1], corp[2])
        return os.path.join(self._corpus_dir(corp), filename)

    def _matrix_path(self, corp, normed):
        """Return the path of the raw or ``_NORMED`` data matrix for ``corp``."""
        filename = '{0}_{1}_lems=False_{2}_min_occ={3}_{4}no_stops=False_weighted={5}{6}.dat'.format(
            self.prefix, corp[1], self.english, corp[2], self.svd, corp[3],
            '_NORMED' if normed else '')
        return os.path.join(self._corpus_dir(corp), filename)

    def load_vectors(self):
        """
        Loads the appropriate word vector from each corpus in self.corpora

        When ``self.norm`` is true the raw matrix of each corpus is scaled
        with ``sklearn.preprocessing.scale`` and written to the corresponding
        ``_NORMED.dat`` file first; otherwise the existing ``_NORMED.dat``
        file is read. The row belonging to ``self.greek`` is stored in
        ``self.ekk_rows`` as a ``pandas.Series`` indexed by the corpus words.

        :raises ValueError: if ``self.greek`` is not in a corpus index list
        """
        for corp in self.corpora:
            name = corp[0]
            # NOTE(review): pickle must only be loaded from trusted files.
            # The index is used as a list below (``.index``) -- presumably a
            # list of word strings; confirm against the producer.
            rows = pd.read_pickle(self._index_path(corp))
            i = rows.index(self.greek)
            shape = (len(rows), len(rows))
            normed_path = self._matrix_path(corp, normed=True)
            if self.norm:
                print('Now normalizing {}'.format(name))
                # The raw matrix is only read, so do not request write access.
                orig = np.memmap(self._matrix_path(corp, normed=False),
                                 dtype='float', mode='r', shape=shape)
                normed = np.memmap(normed_path, dtype='float', mode='w+',
                                   shape=shape)
                normed[:] = scale(orig)
                # Make sure the normalized matrix reaches the disk.
                normed.flush()
                # Copy the row so that the memmaps can be released.
                r = np.array(normed[i])
                del normed
                del orig
            else:
                source = np.memmap(normed_path, dtype='float', mode='r',
                                   shape=shape)
                r = np.array(source[i])
                del source
            self.ekk_rows[name] = pd.Series(r, index=rows)

    def sim_calc(self):
        """
        Calculates the similarity for each vector with the others based on
        the words that the corpora share

        The pairwise cosine similarities are stored in ``self.cs_scores``
        (a square ``pandas.DataFrame``; cells that were never assigned, i.e.
        the diagonal, are filled with 1). For every pair of corpora a
        tab-separated word list is also written to ``self.base``.
        """
        names = list(self.ekk_rows)
        self.cs_scores = pd.DataFrame(index=names, columns=names, dtype=float)
        for first, second in combinations(names, 2):
            vec_a = self.ekk_rows[first]
            vec_b = self.ekk_rows[second]
            shared = list(set(vec_a.index).intersection(vec_b.index))
            # pairwise_distances expects 2-D input: one sample per corpus.
            similarity = (1 - pairwise_distances(
                vec_a[shared].values.reshape(1, -1),
                vec_b[shared].values.reshape(1, -1),
                metric='cosine'))[0][0]
            self.cs_scores.loc[first, second] = similarity
            self.cs_scores.loc[second, first] = similarity
            # Absolute differences between the values above the threshold.
            # Sorted ascending, so the head holds the 100 smallest differences.
            # NOTE(review): confirm smallest (not largest) is what is wanted.
            differences = abs(vec_a[vec_a > self.zscore]
                              - vec_b[vec_b > self.zscore])
            top_100 = differences.sort_values().head(100)
            top_100.to_csv(
                os.path.join(self.base, '{}_{}_zscore={}_top_100_words.txt'.format(
                    first, second, self.zscore)),
                sep='\t')
        self.cs_scores = self.cs_scores.fillna(1)

    def graph_it(self):
        """
        Graphs the results on a bar graph

        One group of bars is drawn per corpus from ``self.cs_scores`` and the
        figure is saved as ``<english>_CS_corps_compare.png`` in ``self.base``.
        ``sim_calc`` must have been called first.
        """
        fig, ax = plt.subplots()
        index = np.arange(len(self.cs_scores)) * 1.2
        bar_width = 0.15
        for mult, corp in enumerate(self.cs_scores):
            rects = ax.bar(index + bar_width * mult, self.cs_scores.loc[corp],
                           bar_width, color='.9', label=corp)
            # All-lowercase corpus names are title-cased for display only.
            name = corp.title() if corp.islower() else corp
            for rect in rects:
                height = rect.get_height()
                centre = rect.get_x() + rect.get_width() / 2.
                ax.text(centre, height / 2, name, size='small',
                        rotation='vertical', ha='center', va='bottom')
                # A height of exactly 1 is the filled-in self comparison;
                # its value is not printed.
                if height != 1:
                    ax.text(centre, height + .01, str(round(height, 2)),
                            size='small', rotation='vertical', ha='center',
                            va='bottom')
        ax.set_xlabel('Corpus')
        ax.set_ylabel('CS Score')
        ax.set_title('CS comparison of word vectors')
        ax.set_xticks(index + 3 * bar_width)
        ax.set_xticklabels([x for x in self.cs_scores])
        fig.savefig(
            os.path.join(self.base, '{}_CS_corps_compare.png'.format(self.english)),
            dpi=500)
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
class matrix_comparison(comparison):
    """
    Compares whole matrices instead of a single word vector: every corpus
    in ``self.corpora`` is compared with the first corpus (the NT) over the
    words the two corpora share.

    ``load_vectors`` stores the word index list of each corpus in
    ``self.ekk_rows``; ``sim_calc`` stores the results in ``self.scores``.
    The inherited ``graph_it`` is not usable with this class because
    ``sim_calc`` does not fill ``self.cs_scores``.
    """

    def _corpus_dir(self, corp):
        """Return the directory ``<base>/<corpus>/<english>/<window>``."""
        return os.path.join(self.base, corp[0], self.english, corp[1])

    def _normed_path(self, corp):
        """Return the path of the ``_NORMED`` data matrix for ``corp``."""
        # NOTE(review): unlike the files read by ``comparison.load_vectors``
        # this name carries no ``weighted=`` part and the data is read as
        # float32 -- confirm that this matches the files on disk.
        filename = '{0}_{1}_lems=False_{2}_min_occ={3}_{4}no_stops=False_NORMED.dat'.format(
            self.prefix, corp[1], self.english, corp[2], self.svd)
        return os.path.join(self._corpus_dir(corp), filename)

    def load_vectors(self):
        """
        Loads the word index list of each corpus in self.corpora into
        ``self.ekk_rows`` (keyed by corpus name).
        """
        for corp in self.corpora:
            filename = '{0}_IndexList_w={1}_lems=False_min_occs={2}_no_stops=False.pickle'.format(
                self.english, corp[1], corp[2])
            # NOTE(review): pickle must only be loaded from trusted files.
            self.ekk_rows[corp[0]] = pd.read_pickle(
                os.path.join(self._corpus_dir(corp), filename))

    def sim_calc(self, n_jobs=12):
        """
        Calculates, for each corpus, the average cosine similarity between
        the vectors of the shared words in the first corpus and in that
        corpus, restricted to the shared words as dimensions.

        The results are stored in ``self.scores`` under the key
        ``'<first corpus>_<corpus>'``. A corpus sharing no word with the
        first corpus gets the score ``nan``.

        :param n_jobs: number of parallel jobs passed on to
            ``sklearn.metrics.pairwise_distances``
        :type n_jobs: int
        """
        nt = self.corpora[0]
        nt_rows = self.ekk_rows[nt[0]]
        # The reference matrix is the same for every comparison: open it once.
        nt_data = np.memmap(self._normed_path(nt), dtype='float32', mode='r',
                            shape=(len(nt_rows), len(nt_rows)))
        self.scores = {}
        for corp in self.corpora:
            rows = self.ekk_rows[corp[0]]
            # First position of every word, matching ``list.index`` but with
            # constant-time lookups instead of a scan per word.
            first_position = {}
            for position, word in enumerate(rows):
                first_position.setdefault(word, position)
            i_nt = []
            i_c2 = []
            for i, word in enumerate(nt_rows):
                position = first_position.get(word)
                if position is not None:
                    i_nt.append(i)
                    i_c2.append(position)
            key = '{0}_{1}'.format(nt[0], corp[0])
            if not i_nt:
                # No shared vocabulary: there is nothing to average.
                self.scores[key] = float('nan')
                continue
            c2_data = np.memmap(self._normed_path(corp), dtype='float32',
                                mode='r', shape=(len(rows), len(rows)))
            # Keep only the shared words, both as rows and as columns.
            d_c2 = c2_data[i_c2][:, i_c2]
            d_nt = nt_data[i_nt][:, i_nt]
            del c2_data
            # The diagonal pairs each word's vector in the first corpus with
            # the same word's vector in the other corpus.
            self.scores[key] = np.average(np.diag(
                1 - pairwise_distances(d_nt, d_c2, metric='cosine',
                                       n_jobs=n_jobs)))


def cmd():
    """
    Command line entry point: parses the arguments, builds the comparison
    object and runs ``load_vectors``, ``sim_calc`` and ``graph_it`` on it.
    """
    parser = argparse.ArgumentParser(
        description='Compares distributional data across corpora.')
    parser.add_argument('--base', type=str, default='./',
                        help='The file path for the parent folder in which all the corpora sub-folders are located')
    # Both words are part of every file path, so they cannot be omitted.
    parser.add_argument('--english', type=str, required=True,
                        help='The transliteration into Latin characters for the word under investigation')
    parser.add_argument('--greek', type=str, required=True,
                        help='The word under investigation in its native alphabet')
    parser.add_argument('--measure', type=str, default='CS',
                        choices=['CS', 'LL', 'PPMI', 'cooc'],
                        help='The type of data to be used for the comparison')
    parser.add_argument('--norm', dest='norm', action='store_true',
                        help='Whether to run data normalization on the input matrices (should be True if the data has not yet been normalized')
    # Without a default, ``None`` would override the constructor default and
    # break the threshold comparison in ``sim_calc``.
    parser.add_argument('--zscore', type=float, default=1.0,
                        help='The minimum Z-score for words represented in the resulting top-100 lists ')
    parser.set_defaults(func=comparison, norm=False)
    args = parser.parse_args()
    options = dict(vars(args))
    # ``func`` selects the class to build; it is not a constructor argument.
    factory = options.pop('func')
    pipe = factory(**options)
    pipe.load_vectors()
    pipe.sim_calc()
    pipe.graph_it()


if __name__ == '__main__':
    cmd()