__author__ = 'matt'
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import scale
from itertools import combinations
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import argparse
import os
class comparison:
    """ Compares the vectors of a single word across the data from several different corpora.

    Note that the data from the different corpora must be normalized, preferably using
    sklearn.preprocessing.scale. Pass ``norm=True`` to have :meth:`load_vectors` do this.

    Typical use is ``load_vectors()``, then ``sim_calc()``, then ``graph_it()``.

    :param base: the directory containing the sub-directories that contain the data for the different corpora. The data file paths are built by appending the corpus name directly to this string, so it must end with a path separator (e.g. ``'./'``)
    :type base: str
    :param english: the english transcription of the word being analyzed (used only in file naming)
    :type english: str
    :param greek: the word in the alphabet of the target language. It must be written exactly as it is present in the corpora!
    :type greek: str
    :param measure: the type of data to use in the comparison, cosine similarity (CS), log-likelihood (LL), positive pointwise mutual information (PPMI), or raw co-occurrence counts (cooc)
    :type measure: str
    :param norm: whether the data needs to be normalized
    :type norm: bool
    :param zscore: only values strictly greater than this threshold are used when building the per-pair word lists in :meth:`sim_calc`. ``None`` is treated as the default of 1.0
    :type zscore: float
    :param kwargs: any additional keyword arguments are accepted and ignored
    :raises ValueError: if ``measure`` is not one of "CS", "LL", "PPMI" or "cooc"

    :ivar corpora: the parameter information for each corpus to be used. Each corpus is represented by a tuple that contains the following information:
        **Corpus Name** *(str)*: should match the name of the parent folder in which the text files for the corpus are kept,
        **Best window size** *(str)*: the size of the context window as determined by ParamTester,
        **Minimum occurrences** *(int)*: the minimum number of times a word had to occur in your corpus before being used to produce data in SemPipeline,
        **Weighted or Unweighted window type** *(bool)*: whether the data for that corpus was produced using a weighted (True) or unweighted (False) context window type
    :type corpora: [(str, str, int, bool)]
    :ivar base: passed on from the ``base`` parameter
    :type base: str
    :ivar ekk_rows: an empty dictionary that will contain the vectors for each corpus
    :type ekk_rows: dict
    :ivar english: passed on from the ``english`` parameter
    :type english: str
    :ivar greek: passed on from the ``greek`` parameter
    :type greek: str
    :ivar prefix: part of the naming convention for the files from which the vectors will be extracted. Determined by the ``measure`` parameter
    :type prefix: str
    :ivar svd: part of the naming convention for the files from which the vectors will be extracted. Currently always the empty string
    :type svd: str
    :ivar norm: passed on from the ``norm`` parameter
    :type norm: bool
    :ivar zscore: passed on from the ``zscore`` parameter
    :type zscore: float
    :ivar cs_scores: corpus-by-corpus table of similarity scores, created by :meth:`sim_calc`
    :type cs_scores: pandas.DataFrame
    """

    # File-name prefix used for each supported value of ``measure``.
    _PREFIXES = {'CS': 'LL_cosine', 'LL': 'LL', 'cooc': 'COOC', 'PPMI': 'PPMI'}

    def __init__(self, base, english, greek, measure, norm=False, zscore=1.0, **kwargs):
        self.corpora = [('NT', '16', 1, True), ('LXX', '13', 1, True),
                        ('philo', '26', 1, False), ('josephus', '35', 1, False),
                        ('plutarch', '49', 1, False), ('pers_data', '51', 1, False)]
        self.base = base
        self.ekk_rows = {}
        self.english = english
        self.greek = greek
        if measure not in self._PREFIXES:
            # Fail immediately: without a prefix no data file can be located,
            # so every later step would only fail with a less helpful error.
            raise ValueError('"measure" must be "CS", "LL", "PPMI", or "cooc"')
        self.prefix = self._PREFIXES[measure]
        self.svd = ''
        self.norm = norm
        # The command-line front end passes None when no threshold is given.
        self.zscore = 1.0 if zscore is None else zscore

    def _index_path(self, corp):
        """ Returns the path of the pickled word list for the corpus tuple ``corp``
        """
        return '{0}{1}/{4}/{2}/{4}_IndexList_w={2}_lems=False_min_occs={3}_no_stops=False.pickle'.format(
            self.base, corp[0], corp[1], corp[2], self.english)

    def _matrix_path(self, corp, normed):
        """ Returns the path of the raw (``normed=False``) or normalized (``normed=True``) matrix file for the corpus tuple ``corp``
        """
        path = '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_weighted={7}'.format(
            self.base, corp[0], corp[1], corp[2], self.english, self.prefix, self.svd, corp[3])
        return path + ('_NORMED.dat' if normed else '.dat')

    def load_vectors(self):
        """ Loads the appropriate word vector from each corpus in self.corpora

        For every corpus the pickled word list is read, the position of ``self.greek`` in it
        is looked up, and that row of the corpus' square matrix file is stored in
        ``self.ekk_rows`` as a :class:`pandas.Series` indexed by the word list.

        If ``self.norm`` is set, the raw matrix is first passed through
        ``sklearn.preprocessing.scale`` and the result is written to the ``_NORMED.dat`` file
        (overwriting any existing one). Otherwise the existing ``_NORMED.dat`` file is read.

        :raises ValueError: if ``self.greek`` does not occur in a corpus' word list
        """
        for corp in self.corpora:
            name = corp[0]
            # NOTE: unpickling executes arbitrary code; only point ``base`` at trusted data.
            rows = pd.read_pickle(self._index_path(corp))
            try:
                i = rows.index(self.greek)
            except ValueError:
                raise ValueError('{!r} does not occur in the word list of corpus {}'.format(
                    self.greek, name))
            size = len(rows)
            normed_path = self._matrix_path(corp, normed=True)
            if self.norm:
                print('Now normalizing {}'.format(name))
                orig = np.memmap(self._matrix_path(corp, normed=False),
                                 dtype='float', mode='r', shape=(size, size))
                normed = np.memmap(normed_path, dtype='float', mode='w+',
                                   shape=(size, size))
                normed[:] = scale(orig)
                normed.flush()
                # Copy the row so that the mappings can really be released below.
                r = np.array(normed[i])
                del normed
                del orig
            else:
                matrix = np.memmap(normed_path, dtype='float', mode='r',
                                   shape=(size, size))
                r = np.array(matrix[i])
                del matrix
            self.ekk_rows[name] = pd.Series(r, index=rows)

    def sim_calc(self):
        """ Calculates the similarity for each vector with the others based on the words that the corpora share

        The cosine similarity of every pair of corpora is stored symmetrically in
        ``self.cs_scores``; the cells never assigned (the diagonal) are filled with 1.

        For every pair a tab-separated file ``<base>/<corpus1>_<corpus2>_zscore=<zscore>_top_100_words.txt``
        is also written. It holds the first 100 entries, sorted in ascending order, of the
        absolute difference between the two vectors, restricted to values greater than
        ``self.zscore``. Words that pass the threshold in only one of the two corpora have
        no defined difference and therefore sort last.
        """
        names = list(self.ekk_rows)
        self.cs_scores = pd.DataFrame(index=names, columns=names, dtype=float)
        for first, second in combinations(names, 2):
            vec_a = self.ekk_rows[first]
            vec_b = self.ekk_rows[second]
            shared = list(set(vec_a.index).intersection(set(vec_b.index)))
            # pairwise_distances expects 2-D input: one row (sample) per vector.
            row_a = np.asarray(vec_a[shared], dtype=float).reshape(1, -1)
            row_b = np.asarray(vec_b[shared], dtype=float).reshape(1, -1)
            score = (1 - pairwise_distances(row_a, row_b, metric='cosine'))[0][0]
            self.cs_scores.loc[first, second] = score
            self.cs_scores.loc[second, first] = score
            high_a = vec_a[vec_a > self.zscore]
            high_b = vec_b[vec_b > self.zscore]
            top_100 = (high_a - high_b).abs().sort_values().head(100)
            # header=False keeps the two-column, header-less layout regardless of pandas version.
            top_100.to_csv('{}/{}_{}_zscore={}_top_100_words.txt'.format(
                self.base, first, second, self.zscore), sep='\t', header=False)
        self.cs_scores = self.cs_scores.fillna(1)

    def graph_it(self):
        """ Graphs the results on a bar graph

        Draws one group of bars per corpus from ``self.cs_scores``, labels every bar with the
        name of the corpus it is compared with (and its score, unless the score is exactly 1)
        and saves the figure to ``<base>/<english>_CS_corps_compare.png``.
        Requires :meth:`sim_calc` to have been run first.
        """
        fig, ax = plt.subplots()
        index = np.arange(len(self.cs_scores)) * 1.2
        bar_width = 0.15
        for mult, corp in enumerate(self.cs_scores):
            # All-lowercase corpus names are title-cased for display only.
            name = corp.title() if corp.islower() else corp
            heights = self.cs_scores.loc[corp].astype(float)
            rects = ax.bar(index + bar_width * mult, heights, bar_width,
                           color='.9', label=corp)
            for rect in rects:
                height = rect.get_height()
                centre = rect.get_x() + rect.get_width() / 2.
                ax.text(centre, height / 2, name, size='small',
                        rotation='vertical', ha='center', va='bottom')
                if height != 1:
                    # A score of exactly 1 is the filled-in self-comparison; leave it unlabelled.
                    ax.text(centre, height + .01, str(round(float(height), 2)),
                            size='small', rotation='vertical', ha='center',
                            va='bottom')
        ax.set_xlabel('Corpus')
        ax.set_ylabel('CS Score')
        ax.set_title('CS comparison of word vectors')
        ax.set_xticks(index + 3 * bar_width)
        ax.set_xticklabels([x for x in self.cs_scores])
        fig.savefig('{}/{}_CS_corps_compare.png'.format(self.base, self.english), dpi=500)
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
class matrix_comparison(comparison):
    """ Compares the whole matrix of the first corpus in ``self.corpora`` with that of every corpus

    Where the parent class compares the vector of one word, this class compares, for each
    corpus, the sub-matrices restricted to the words shared with the first corpus.
    Results are stored in ``self.scores``; ``self.cs_scores`` is not set by this class.
    """

    def load_vectors(self):
        """ Loads the pickled word list of every corpus in self.corpora into ``self.ekk_rows``

        Unlike the parent class, ``self.ekk_rows`` holds word lists here, not vectors.
        """
        for corp in self.corpora:
            # NOTE: unpickling executes arbitrary code; only point ``base`` at trusted data.
            self.ekk_rows[corp[0]] = pd.read_pickle(
                '{0}{1}/{4}/{2}/{4}_IndexList_w={2}_lems=False_min_occs={3}_no_stops=False.pickle'.format(
                    self.base, corp[0], corp[1], corp[2], self.english))

    def sim_calc(self):
        """ Calculates the average row-wise cosine similarity between the first corpus and each corpus

        For every corpus (including the first one itself) the words shared with the first
        corpus are located in both word lists, both matrices are cut down to those rows and
        columns, and the mean cosine similarity of corresponding rows is stored in
        ``self.scores`` under the key ``'<first corpus>_<corpus>'``.

        NOTE(review): the matrix file names used here carry no ``_weighted=`` part and are
        read as ``float32``, unlike the files used by the parent class -- confirm that this
        matches the files on disk.
        """
        nt = self.corpora[0]
        nt_rows = self.ekk_rows[nt[0]]
        nt_size = len(nt_rows)
        # The reference matrix is the same for every corpus, so map it only once.
        nt_matrix = np.memmap(
            '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_NORMED.dat'.format(
                self.base, nt[0], nt[1], nt[2], self.english, self.prefix,
                self.svd), dtype='float32', mode='r',
            shape=(nt_size, nt_size))
        self.scores = {}
        for corp in self.corpora:
            rows = self.ekk_rows[corp[0]]
            # Position of the first occurrence of every word (what list.index would
            # return), built once so each lookup below is O(1) instead of a list scan.
            position = {}
            for pos, word in enumerate(rows):
                position.setdefault(word, pos)
            i_nt = []
            i_c2 = []
            for i, word in enumerate(nt_rows):
                pos = position.get(word)
                if pos is not None:
                    i_nt.append(i)
                    i_c2.append(pos)
            d_c2 = np.memmap(
                '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_NORMED.dat'.format(
                    self.base, corp[0], corp[1], corp[2], self.english, self.prefix, self.svd),
                dtype='float32', mode='r', shape=(len(rows), len(rows)))[i_c2]
            d_c2 = d_c2[:, i_c2]
            d_nt = nt_matrix[i_nt]
            d_nt = d_nt[:, i_nt]
            # Only the diagonal (row k of one matrix against row k of the other) is used.
            self.scores['{0}_{1}'.format(nt[0], corp[0])] = np.average(np.diag(
                1 - pairwise_distances(d_nt, d_c2, metric='cosine',
                                       n_jobs=12)))
def cmd(argv=None):
    """ Command-line entry point: parses the arguments and runs the full comparison

    Builds the object selected by the parser's ``func`` default (``comparison``) from the
    parsed options and then calls ``load_vectors``, ``sim_calc`` and ``graph_it`` on it.

    :param argv: the argument list to parse; ``None`` (the default) uses ``sys.argv[1:]``
    :type argv: list or None
    """
    parser = argparse.ArgumentParser(description='Compares distributional data across corpora.')
    parser.add_argument('--base', type=str, default='./', help='The file path for the parent folder in which all the corpora sub-folders are located')
    # Both forms of the word are needed to build every data file path, so they are mandatory.
    parser.add_argument('--english', type=str, required=True, help='The transliteration into Latin characters for the word under investigation')
    parser.add_argument('--greek', type=str, required=True, help='The word under investigation in its native alphabet')
    parser.add_argument('--measure', type=str, default='CS', choices=['CS', 'LL', 'PPMI', 'cooc'], help='The type of data to be used for the comparison')
    parser.add_argument('--norm', dest='norm', action='store_true', help='Whether to run data normalization on the input matrices (should be True if the data has not yet been normalized')
    # Without an explicit default argparse would hand None to the constructor.
    parser.add_argument('--zscore', type=float, default=1.0, help='The minimum Z-score for words represented in the resulting top-100 lists ')
    parser.set_defaults(func=comparison, norm=False)
    args = parser.parse_args(argv)
    options = dict(vars(args))
    # ``func`` selects the class to build; it is not one of its constructor arguments.
    func = options.pop('func')
    pipe = func(**options)
    pipe.load_vectors()
    pipe.sim_calc()
    pipe.graph_it()
# Run the command-line interface only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    cmd()