__author__ = 'matt'
import pandas as pd
import numpy as np
import sys
from pickle import dump
try:
from Data_Production.TK_files import tk_control
except ImportError:
print(
'Tkinter cannot be used on this Python installation.\nPlease designate a list of files in the files variable.')
sys.setrecursionlimit(50000)
# word_cats = (('θεός', 12), ('ἔθνος', 11), ('λατρεύω', 53),
# ('βασιλεύς', 37), ('ἡγέομαι', 36), ('γινώσκω', 28))
class CatSim:
def __init__(self):
"""
Calculates similarity of words within Louw-Nida semantic subdomains
based on pre-calculated matrices of similarity results. See sub-classes
for more information
:return:
:rtype:
"""
try:
self.ln = pd.read_pickle('Data/Chapter_2/LN_Cat_Dict.pickle')
except FileNotFoundError:
ln_file = tk_control(
"askopenfilename(title='Where is your Louw-Nida dictionary pickle?')")
self.ln = pd.read_pickle(ln_file)
self.scores = {}
self.averages = {}
self.ave_no_93 = {}
self.good_words = []
self.prob_words = []
self.prob_word_replace = {'περιΐστημι': 'περιΐστημι',
'προΐστημι': 'προΐστημι',
'παρατεινω': 'παρατείνω',
'μήπως': '',
'ταβέρνη': 'Ταβέρνη',
'ἀναμάρτητος': '',
'προσεγγίζω': '',
'ηλι': 'ἠλί',
'δανειστής': 'δανιστής',
'κατακύπτω': '',
'πρωΐ': 'πρωΐ',
'τυρβάζω': '',
'Θυάτιρα': 'Θυάτειρα',
'τετράπουν': 'τετράπους',
'ἰουδαΐζω': 'ἰουδαΐζω',
'Τρωγύλλιον': '',
'σεβαστός': 'Σεβαστός',
'σωτήριον': 'σωτήριος',
'κακοπάθεια': 'κακοπαθία',
'προσανατίθεμαι': 'προσανατίθημι',
'Χερούβ': 'Χεροῦβ',
'πρωΐα': 'πρωΐα',
'Σαλίμ': 'Σαλείμ',
'νόσημα': '',
'διΐστημι': 'διΐστημι',
'Νικολαΐτης': 'Νικολαΐτης',
'αὐτόφωρος': '',
'ὅμιλος': '',
'μαρανα': [],
'Ματθάτ': 'Ματθάν',
'ὀρεινή': 'ὀρεινός',
'δευτερόπρωτος': '',
'ἔξεστι': 'ἔξεστι(ν)',
'τομός': 'τομώτερος',
'πειραω': 'πειράω',
'ῥυπαίνομαι': 'ῥυπαρεύω',
'Λεββαιος': '',
'ελωι': 'ἐλωΐ',
'πραΰς': 'πραΰς',
'ὑπερβαίνω': '',
'ὅποτε': 'ὁπότε',
'ἐνεός': 'ἐνέος',
'τεσσαρακονταετής': 'τεσσερακονταετής',
'ἀγαθοποιΐα': 'ἀγαθοποιΐα',
'Σάπφειρα': 'Σάπφιρα',
'σαβαχθανι': 'σαβαχθάνι',
'στοιχεῖα': 'στοιχεῖον',
'συζήτησις': '',
'ἐκτίθεμαι': 'ἐκτίθημι',
'Κλῆμης': 'Κλήμης',
'τάραχή': 'τάραχος',
'Πάρθοι': 'Πάρθος',
'λεμα': 'λεμά',
'Λωΐς': 'Λωΐς',
'Βηθζαθά': 'Βηθεσδά',
'πίμπραμαι': 'πίμπρημι',
'Σεμεΐν': 'Σεμεΐν',
'Νεάπολις': '',
'Νύμφας': 'Νύμφα',
'πλείων': 'πολύς',
'Πτολεμαΐς': 'Πτολεμαΐς',
'Βηθαβαρα': '',
'Ῥωμαικος': '',
'ἐκλανθάνομαι': 'ἐκλανθάνω',
'Βενιαμείν': 'Βενιαμίν',
'Ἰουνιᾶς': 'Ἰουνία',
'πραΰτης': 'πραΰτης',
'ῥεδή': 'ῥέδη',
'καταβιβαζω': 'καταβαίνω',
'σπόριμα': 'σπόριμος',
'Ἀχαΐα': 'Ἀχαΐα',
'εὐποιΐα': 'εὐποιΐα',
'ἀνάτεμα': 'ἀνάθεμα',
'ὀσφῦς': 'ὀσφύς',
'κηριον': '',
'Ναΐν': 'Ναΐν',
'θα': 'θά',
'δήποτε': '',
'θρῆνος': '',
'Ἠσαΐας': 'Ἠσαΐας',
'καταγράφω': '',
'Καλοι Λιμένης': '',
'πραϋπάθεια': 'πραϋπαθία',
'τεσσαράκοντα': 'τεσσεράκοντα',
'μελισσιος': '',
'Γεργεσηνός': 'Γερασηνός',
'Ἑβραικός': '',
'ἅλς': '',
'Φάρες': 'Φαρές',
'λόγια': 'λογεία',
'ἀΐδιος': 'ἀΐδιος',
'ἀγραύλεω': 'ἀγραυλέω',
'νεομηνία': 'νουμηνία',
'Ἑβραΐς': 'Ἑβραΐς',
'Ἀβραάμ': 'Ἀβραάμ'.lower()}
def LoadDF(self, w):
raise NotImplementedError('LoadDF is not implemented on CatSim. '
'Instead, use a sub-class (CatSimWin or '
'CatSimSVD).')
def SimCalc(self, w):
"""
loops through each Louw-Nida sub-domain and calculates the individual
similarity of each word with the other words.
:param w: the window size being calculated
:type w: int
:return: self.scores - a dictionary with keys for each window size,
sub-keys for each Louw-Nida sub-domain, and values as Pandas
DataFrames with the scores for the words in each sub-domain with
each other
:rtype: dict
"""
self.scores[w] = {}
try:
mean, std = np.mean(self.df.values), np.std(self.df.values)
except AttributeError:
mean, std = np.mean(self.df), np.std(self.df)
print('%s average: %s, std: %s' % (w, mean, std))
self.tot_words = 0
self.words_no_93 = 0
self.not_words = 0
for cat in self.ln.keys():
words = []
for d in self.ln[cat]['words']:
word = list(d.keys())[0]
if word.lower() in self.ind:
words.append((word.lower(), d[word]))
self.tot_words += 1
self.good_words.append(word)
if cat[0] != 93:
self.words_no_93 += 1
else:
try:
new_word = self.prob_word_replace[word]
if new_word != '':
words.append((new_word.lower(), d[word]))
self.tot_words += 1
self.good_words.append(w)
if cat[0] != 93:
self.words_no_93 += 1
self.prob_words.append(word)
self.not_words += 1
except KeyError:
continue
words = list(set(words))
self.scores[w][cat] = pd.DataFrame(index=words,
columns=['Mean', 'STD +/-'])
for word1 in words:
vals = []
for word2 in words:
if word1[0] != word2[0]:
try:
vals.append(self.df[self.ind.index(word1[0])][
self.ind.index(word2[0])])
except ValueError:
continue
# scores[win][cat].ix[word1, 'Gloss'] = word1[1]
#try:
# self.scores[w][cat].ix[word1, 'Mean'] = np.mean(vals)
# self.scores[w][cat].ix[word1, 'STD +/-'] = (np.mean(vals)-mean)/std
#except ValueError:
#self.scores[w][cat].drop_duplicates(inplace=True)
self.scores[w][cat].ix[word1, 'Mean'] = np.mean(vals)
self.scores[w][cat].ix[word1, 'STD +/-'] = (np.mean(
vals) - mean) / std
print('Total words: {0}'.format(self.words_no_93))
def AveCalc(self, w):
"""
calculates the average cosine-similarity score and the z-score of this
score for each window size, returning a single score for each window
:param w: the window size under investigation
:type w: int
:return: self.averages - the average for all words for each window size
:rtype: dict
:return: self.ave_no_93 - the average for all words not in domain 93 (proper names)
:rtype: dict
"""
total_std = 0
total_mean = 0
total_no_93_std = 0
total_no_93_mean = 0
for cat in self.scores[w]:
total_mean += self.scores[w][cat].ix[:, 'Mean'].fillna(0).sum()
total_std += self.scores[w][cat].ix[:, 'STD +/-'].fillna(0).sum()
if cat[0] != 93:
total_no_93_mean += self.scores[w][cat].ix[:, 'Mean'].fillna(
0).sum()
total_no_93_std += self.scores[w][cat].ix[:, 'STD +/-'].fillna(
0).sum()
self.averages[w] = (
total_mean / self.tot_words, total_std / self.tot_words)
self.ave_no_93[w] = (total_no_93_mean / self.words_no_93,
total_no_93_std / self.words_no_93)
def WriteFiles(self):
raise NotImplementedError('WriteFiles is not implemented on CatSim. '
'Instead, use a sub-class (CatSimWin or '
'CatSimSVD).')
def WriteLines(self, save_file, w_size, svd_exp, lems):
"""
writes the lines of the output csv files
:param save_file: the file path and name for the output file
:type save_file: str
:param w_size: the window size under investigation
:type w_size: int
:param svd_exp: Caron's exponent under investigation (1.0 for none)
:type svd_exp: float
:param lems: whether the data is based on a lemmatized corpus or not
:type lems: bool
:return: a file with the scores for every words in its categories
:rtype: tab-delimited csv file
"""
with open(save_file, mode='w', encoding='utf-8') as file:
file.write(
'Scores for Window Size {0}, SVD Exponent {1}\n'.format(w_size,
svd_exp))
file.write(
'Category\tWord\tGloss\t# of Occurrences\tMean CS with Category\t'
'Standard Deviations +/- Average\n')
if self.rng_type == 'win':
key = w_size
else:
key = svd_exp
for cat in sorted(self.scores[key].keys()):
for w in self.scores[key][cat].index:
try:
cnt = lems[w[0]]
except KeyError:
cnt = '?'
try:
file.write(
'{0}.{1}-{2} {3}\t{4}\t{5}\t{6}\t{7}\t{8}\n'.format
(
cat[0],
cat[1],
cat[2],
self.ln[cat]['gloss'].replace(',', ' '),
w[0],
w[1].replace(',', ' '),
cnt,
self.scores[key][cat].ix[w, 'Mean'][0],
self.scores[key][cat].ix[w, 'STD +/-'][0]
)
)
except IndexError:
file.write(
'{0}.{1}-{2} {3}\t{4}\t{5}\t{6}\t{7}\t{8}\n'.format
(
cat[0],
cat[1],
cat[2],
self.ln[cat]['gloss'].replace(',', ' '),
w[0],
w[1].replace(',', ' '),
cnt,
self.scores[key][cat].ix[w, 'Mean'],
self.scores[key][cat].ix[w, 'STD +/-']
)
)
def CatSimPipe(self):
"""
convenience function to guide the calculation and output process for all window sizes
:return: None
:rtype: None
"""
for w in self.rng:
self.LoadDF(w)
self.SimCalc(w)
self.AveCalc(w)
self.WriteFiles()
prob_words = list(set(self.prob_words))
good_words = list(set(self.good_words))
print('prob_words', len(prob_words), prob_words[:10])
print('good_words', len(good_words), good_words[:10])
print(self.averages, self.ave_no_93)
[docs]class CatSimWin(CatSim):
""" Calculates the average similarity and the z-score of this similarity
for all words that share the same semantic sub-domains in the Louw-Nida
lexicon
:param algo: the significance algorithm used to produce the cosine-similarity matrices used
:type algo: str
:param rng: the individual window sizes used for the discrete calculations
:type rng: list
:param lems: whether the input matrices were calculated with lemmatized texts or not
:type lems: bool
:param CS_dir: the directory path where the cosine-similarity matrices are located
:type CS_dir: str
:param dest_dir: the diretory path to save the results
:type dest_dir: str
:param sim_algo: which similarity algorithm was used in the calculations
:type sim_algo: str
:param corpus: tuple with the name of the corpus (str), the minimum number
of occurrences used (int), Caron's svd exponent (float - 1.0 if none was used),
and whether stop words were included (bool)
:type corpus: tuple
:param lem_file: the file path and filename of the occurrence dictionary
pickle that shows the number of time each word occurs in the corpus
:type lem_file: str
"""
def __init__(self, algo, rng, lems=False, CS_dir=None, dest_dir=None,
sim_algo=None, corpus=('SBL_GNT_books', 1, 1.0, True),
lem_file=None):
try:
self.ln = pd.read_pickle('Chapter_2/LN_Cat_Dict.pickle')
except FileNotFoundError:
ln_file = tk_control(
"askopenfilename(title='Where is your Louw-Nida dictionary pickle?')")
self.ln = pd.read_pickle(ln_file)
self.scores = {}
self.averages = {}
self.ave_no_93 = {}
self.good_words = []
self.prob_words = []
self.rng_type = 'win'
self.rng = rng
self.algo = algo
self.CS_dir = CS_dir
self.dest_dir = dest_dir
self.corpus = corpus
self.lems = lems
self.lem_file = lem_file
self.sim_algo = sim_algo
# self.prob_word_replace are the words that have different unicode representations
# in the Louw-Nida data and the corpus data. The key is the word as it is in
# Louw-Nida, the value as it is in the corpus.
self.prob_word_replace = {'περιΐστημι': 'περιΐστημι',
'προΐστημι': 'προΐστημι',
'παρατεινω': 'παρατείνω',
'μήπως': '',
'ταβέρνη': 'Ταβέρνη',
'ἀναμάρτητος': '',
'προσεγγίζω': '',
'ηλι': 'ἠλί',
'δανειστής': 'δανιστής',
'κατακύπτω': '',
'πρωΐ': 'πρωΐ',
'τυρβάζω': '',
'Θυάτιρα': 'Θυάτειρα',
'τετράπουν': 'τετράπους',
'ἰουδαΐζω': 'ἰουδαΐζω',
'Τρωγύλλιον': '',
'σεβαστός': 'Σεβαστός',
'σωτήριον': 'σωτήριος',
'κακοπάθεια': 'κακοπαθία',
'προσανατίθεμαι': 'προσανατίθημι',
'Χερούβ': 'Χεροῦβ',
'πρωΐα': 'πρωΐα',
'Σαλίμ': 'Σαλείμ',
'νόσημα': '',
'διΐστημι': 'διΐστημι',
'Νικολαΐτης': 'Νικολαΐτης',
'αὐτόφωρος': '',
'ὅμιλος': '',
'μαρανα': '',
'Ματθάτ': 'Ματθάν',
'ὀρεινή': 'ὀρεινός',
'δευτερόπρωτος': '',
'ἔξεστι': 'ἔξεστι(ν)',
'τομός': 'τομώτερος',
'πειραω': 'πειράω',
'ῥυπαίνομαι': 'ῥυπαρεύω',
'Λεββαιος': '',
'ελωι': 'ἐλωΐ',
'πραΰς': 'πραΰς',
'ὑπερβαίνω': '',
'ὅποτε': 'ὁπότε',
'ἐνεός': 'ἐνέος',
'τεσσαρακονταετής': 'τεσσερακονταετής',
'ἀγαθοποιΐα': 'ἀγαθοποιΐα',
'Σάπφειρα': 'Σάπφιρα',
'σαβαχθανι': 'σαβαχθάνι',
'στοιχεῖα': 'στοιχεῖον',
'συζήτησις': '',
'ἐκτίθεμαι': 'ἐκτίθημι',
'Κλῆμης': 'Κλήμης',
'τάραχή': 'τάραχος',
'Πάρθοι': 'Πάρθος',
'λεμα': 'λεμά',
'Λωΐς': 'Λωΐς',
'Βηθζαθά': 'Βηθεσδά',
'πίμπραμαι': 'πίμπρημι',
'Σεμεΐν': 'Σεμεΐν',
'Νεάπολις': '',
'Νύμφας': 'Νύμφα',
'πλείων': 'πολύς',
'Πτολεμαΐς': 'Πτολεμαΐς',
'Βηθαβαρα': '',
'Ῥωμαικος': '',
'ἐκλανθάνομαι': 'ἐκλανθάνω',
'Βενιαμείν': 'Βενιαμίν',
'Ἰουνιᾶς': 'Ἰουνία',
'πραΰτης': 'πραΰτης',
'ῥεδή': 'ῥέδη',
'καταβιβαζω': 'καταβαίνω',
'σπόριμα': 'σπόριμος',
'Ἀχαΐα': 'Ἀχαΐα',
'εὐποιΐα': 'εὐποιΐα',
'ἀνάτεμα': 'ἀνάθεμα',
'ὀσφῦς': 'ὀσφύς',
'κηριον': '',
'Ναΐν': 'Ναΐν',
'θα': 'θά',
'δήποτε': '',
'θρῆνος': '',
'Ἠσαΐας': 'Ἠσαΐας',
'καταγράφω': '',
'Καλοι Λιμένης': '',
'πραϋπάθεια': 'πραϋπαθία',
'τεσσαράκοντα': 'τεσσεράκοντα',
'μελισσιος': '',
'Γεργεσηνός': 'Γερασηνός',
'Ἑβραικός': '',
'ἅλς': '',
'Φάρες': 'Φαρές',
'λόγια': 'λογεία',
'ἀΐδιος': 'ἀΐδιος',
'ἀγραύλεω': 'ἀγραυλέω',
'νεομηνία': 'νουμηνία',
'Ἑβραΐς': 'Ἑβραΐς',
'Ἀβραάμ': 'Ἀβραάμ'.lower()}
def LoadDF(self, w):
"""
loads the appropriate cosine-similarity matrix for the window size being calculated
:param w: window size under investigation
:type w: int
:return: self.ind - the list of words in the corpus
:rtype: list
:return: self.df - the cosine-similarity matrix
:rtype: Pandas DataFrame or np.memmap (DataFrames will be phased out in future versions)
"""
file = '/media/matt/Data/DissProject/Data/SBL_GNT_books/{0}/CS_{1}_{0}_SBL_GNT_books_lems=True_min_occ=None_SVD_exp=1.hd5'.format(
str(w), self.algo)
try:
self.df = pd.read_hdf(file, 'df')
except FileNotFoundError:
file = tk_control(
"askopenfilename(title='Where is your pickle file for window = {0}, svd exponent = {1}'.format(str(w), 'None'))")
self.df = pd.read_pickle(file)
except OSError:
file = '{3}/{0}/{1}_{8}_{0}_lems={2}_{4}_min_occ={5}_SVD_exp={6}_no_stops=False_weighted={7}.dat'.format(
str(w), self.algo, self.lems, self.CS_dir, self.corpus[0],
self.corpus[1], self.corpus[2], self.corpus[3], self.sim_algo)
self.ind = pd.read_pickle(
'{0}/{2}/{1}_IndexList_w={2}_lems={3}_min_occs={4}_no_stops=False.pickle'.format(
self.CS_dir, self.corpus[0], str(w), self.lems,
self.corpus[1], self.corpus[3]))
self.df = np.memmap(file, dtype='float', mode='r',
shape=(len(self.ind), len(self.ind)))
def WriteFiles(self):
"""
writes the results of the similarity calculations to 6 individual files
:return: LN_Word_Cat_Scores pickle - the individual CS and z-score for every word in every category in which it is listed
:rtype: pickled dictionary of pandas DataFrames
:return: LN_Word_Cat_Scores CSV - the individual CS and z-score for every word in every category in which it is listed
:rtype: csv file
:return: LN_Window_Averages pickle - dictionary of mean CS score and z-score for each window size tested
:rtype: pickled dictionary
:return: LN_Window_Averages csv - mean CS score and z-score for each window size tested
:rtype: csv file
:return: LN_Window_Averages_no_93 pickle - dictionary of mean CS score and z-score for each window size tested
excluding Louw-Nida category 93 (proper names)
:rtype: pickled dictionary
:return: LN_Window_Averages_no_93 csv - mean CS score and z-score for each window size tested
excluding Louw-Nida category 93 (proper names)
:rtype: csv file
"""
with open(
'{2}/{4}_LN_Word_Cat_Scores_{0}_rng={1}_lems={3}_weighted={5}.pickle'.format(
self.algo, self.rng, self.dest_dir, self.lems,
self.corpus[0], self.corpus[3]), mode='wb') as file:
dump(self.scores, file)
if self.lem_file:
lems = pd.read_pickle(self.lem_file)
else:
lems = {}
for w_size in self.scores.keys():
save_file = '{3}/{5}_{7}_LN_Window={0}_Word_Cat_Scores_SVD_exp={1}_{2}_lems={4}_weighted={6}.csv'.format(
str(w_size), 'None', self.algo, self.dest_dir, self.lems,
self.corpus[0], self.corpus[3], self.sim_algo)
self.WriteLines(save_file, w_size, 'None', lems)
with open(
'{2}/{4}_{6}_LN_Window_Averages_{0}_lems={3}_rng={1}_weighted={5}.pickle'.format(
self.algo, self.rng, self.dest_dir, self.lems,
self.corpus[0], self.corpus[3], self.sim_algo),
mode='wb') as file:
dump(self.averages, file)
with open(
'{2}/{4}_{6}_LN_Window_Averages_{0}_lems={3}_rng={1}_weighted={5}.csv'.format(
self.algo, self.rng, self.dest_dir, self.lems,
self.corpus[0], self.corpus[3], self.sim_algo),
mode='w',
encoding='utf-8') as file:
file.write(
'Average Number of Standard Deviations above or below Average '
'per window\n')
file.write('Window Size,Average,+/- Standard Deviations\n')
for w_size in sorted(self.averages.keys()):
file.write(
'{0},{1},{2}\n'.format(w_size, self.averages[w_size][0],
self.averages[w_size][1]))
with open(
'{2}/{4}_{6}_LN_Window_Averages_no_93_SVD_{0}_lems={3}_rng={1}_weighted={5}.pickle'.format(
self.algo, self.rng, self.dest_dir, self.lems,
self.corpus[0], self.corpus[3], self.sim_algo),
mode='wb') as file:
dump(self.ave_no_93, file)
with open(
'{2}/{4}_{6}_LN_Window_Averages_no_93_SVD_{0}_lems={3}_rng={1}_weighted={5}.csv'.format(
self.algo, self.rng, self.dest_dir, self.lems,
self.corpus[0], self.corpus[3], self.sim_algo),
mode='w',
encoding='utf-8') as file:
file.write(
'Average Number of Standard Deviations above or below Average '
'per window excluding LN Category 93 (Names)\n')
file.write('Window Size,Average +/- Standard Deviations\n')
for w_size in sorted(self.ave_no_93.keys()):
file.write(
'{0},{1},{2}\n'.format(w_size, self.ave_no_93[w_size][0],
self.ave_no_93[w_size][1]))
class CatSimSVD(CatSim):
def __init__(self, rng, win, algo):
'''
This class calculates the average score for each Louw-Nida category
based on the SVD exponent that is used.
:param rng:
:param win:
:param algo:
:return:
'''
try:
self.ln = pd.read_pickle('Data/Chapter_2/LN_Cat_Dict.pickle')
except FileNotFoundError:
ln_file = tk_control(
"askopenfilename(title='Where is your Louw-Nida dictionary pickle?')")
self.ln = pd.read_pickle(ln_file)
self.scores = {}
self.averages = {}
self.ave_no_93 = {}
self.good_words = []
self.prob_words = []
self.rng_type = 'svd'
self.rng = rng
self.win = win
self.algo = algo
self.prob_word_replace = {'περιΐστημι': 'περιΐστημι',
'προΐστημι': 'προΐστημι',
'παρατεινω': 'παρατείνω',
'μήπως': '',
'ταβέρνη': 'Ταβέρνη',
'ἀναμάρτητος': '',
'προσεγγίζω': '',
'ηλι': 'ἠλί',
'δανειστής': 'δανιστής',
'κατακύπτω': '',
'πρωΐ': 'πρωΐ',
'τυρβάζω': '',
'Θυάτιρα': 'Θυάτειρα',
'τετράπουν': 'τετράπους',
'ἰουδαΐζω': 'ἰουδαΐζω',
'Τρωγύλλιον': '',
'σεβαστός': 'Σεβαστός',
'σωτήριον': 'σωτήριος',
'κακοπάθεια': 'κακοπαθία',
'προσανατίθεμαι': 'προσανατίθημι',
'Χερούβ': 'Χεροῦβ',
'πρωΐα': 'πρωΐα',
'Σαλίμ': 'Σαλείμ',
'νόσημα': '',
'διΐστημι': 'διΐστημι',
'Νικολαΐτης': 'Νικολαΐτης',
'αὐτόφωρος': '',
'ὅμιλος': '',
'μαρανα': '',
'Ματθάτ': 'Ματθάν',
'ὀρεινή': 'ὀρεινός',
'δευτερόπρωτος': '',
'ἔξεστι': 'ἔξεστι(ν)',
'τομός': 'τομώτερος',
'πειραω': 'πειράω',
'ῥυπαίνομαι': 'ῥυπαρεύω',
'Λεββαιος': '',
'ελωι': 'ἐλωΐ',
'πραΰς': 'πραΰς',
'ὑπερβαίνω': '',
'ὅποτε': 'ὁπότε',
'ἐνεός': 'ἐνέος',
'τεσσαρακονταετής': 'τεσσερακονταετής',
'ἀγαθοποιΐα': 'ἀγαθοποιΐα',
'Σάπφειρα': 'Σάπφιρα',
'σαβαχθανι': 'σαβαχθάνι',
'στοιχεῖα': 'στοιχεῖον',
'συζήτησις': '',
'ἐκτίθεμαι': 'ἐκτίθημι',
'Κλῆμης': 'Κλήμης',
'τάραχή': 'τάραχος',
'Πάρθοι': 'Πάρθος',
'λεμα': 'λεμά',
'Λωΐς': 'Λωΐς',
'Βηθζαθά': 'Βηθεσδά',
'πίμπραμαι': 'πίμπρημι',
'Σεμεΐν': 'Σεμεΐν',
'Νεάπολις': '',
'Νύμφας': 'Νύμφα',
'πλείων': 'πολύς',
'Πτολεμαΐς': 'Πτολεμαΐς',
'Βηθαβαρα': '',
'Ῥωμαικος': '',
'ἐκλανθάνομαι': 'ἐκλανθάνω',
'Βενιαμείν': 'Βενιαμίν',
'Ἰουνιᾶς': 'Ἰουνία',
'πραΰτης': 'πραΰτης',
'ῥεδή': 'ῥέδη',
'καταβιβαζω': 'καταβαίνω',
'σπόριμα': 'σπόριμος',
'Ἀχαΐα': 'Ἀχαΐα',
'εὐποιΐα': 'εὐποιΐα',
'ἀνάτεμα': 'ἀνάθεμα',
'ὀσφῦς': 'ὀσφύς',
'κηριον': '',
'Ναΐν': 'Ναΐν',
'θα': 'θά',
'δήποτε': '',
'θρῆνος': '',
'Ἠσαΐας': 'Ἠσαΐας',
'καταγράφω': '',
'Καλοι Λιμένης': '',
'πραϋπάθεια': 'πραϋπαθία',
'τεσσαράκοντα': 'τεσσεράκοντα',
'μελισσιος': '',
'Γεργεσηνός': 'Γερασηνός',
'Ἑβραικός': '',
'ἅλς': '',
'Φάρες': 'Φαρές',
'λόγια': 'λογεία',
'ἀΐδιος': 'ἀΐδιος',
'ἀγραύλεω': 'ἀγραυλέω',
'νεομηνία': 'νουμηνία',
'Ἑβραΐς': 'Ἑβραΐς'}
def LoadDF(self, w):
file = '/media/matt/Data/DissProject/Data/SBL_GNT_books/{0}/CS_{2}_{0}_SBL_GNT_books_lems=True_min_occ=None_SVD_exp={1}.hd5'.format(
self.win, w, self.algo)
try:
self.df = pd.read_hdf(file, 'df')
except FileNotFoundError:
file = tk_control(
"askopenfilename(title='Where is your pickle file for window = {0}, svd exponent = {1}'.format('350', w))")
self.df = pd.read_pickle(file)
def WriteFiles(self):
with open(
'Data/Chapter_2/per_book/LN_Word_Cat_Scores_SVD_{0}.pickle'.format(
self.algo), mode='wb') as file:
dump(self.scores, file)
lems = pd.read_pickle('Data/SBLGNT_lem_dict.pickle')
for w_size in self.scores.keys():
save_file = 'Data/Chapter_2/per_book/LN_Window={0}_Word_Cat_Scores_{2}_SVD_exp={1}.csv'.format(
self.win, str(w_size), self.algo)
self.WriteLines(save_file, '350', w_size, lems)
with open(
'Data/Chapter_2/per_book/LN_Window_Averages_SVD_{0}.pickle'.format(
self.algo), mode='wb') as file:
dump(self.averages, file)
with open(
'Data/Chapter_2/per_book/LN_Window_Averages_SVD_{0}.csv'.format(
self.algo),
mode='w',
encoding='utf-8') as file:
file.write(
'Average Number of Standard Deviations above or below Average '
'per SVD Exponent\n')
file.write('SVD Exponent,Average,+/- Standard Deviations\n')
for w_size in sorted(self.averages.keys()):
file.write('{0},{1}\n'.format(w_size, self.averages[w_size]))
with open(
'Data/Chapter_2/per_book/LN_Window_Averages_no_93_SVD_{0}.pickle'.format(
self.algo), mode='wb') as file:
dump(self.ave_no_93, file)
with open(
'Data/Chapter_2/per_book/LN_Window_Averages_no_93_SVD{0}.csv'.format(
self.algo),
mode='w',
encoding='utf-8') as file:
file.write(
'Average Number of Standard Deviations above or below Average '
'per window excluding LN Category 93 (Names)\n')
file.write('Window Size,Average +/- Standard Deviations\n')
for w_size in sorted(self.ave_no_93.keys()):
file.write('{0},{1}\n'.format(w_size, self.ave_no_93[w_size]))
class WordCatFinder(CatSim):
def __init__(self, words):
if type(words) != list:
raise TypeError('"words" must be a list')
try:
self.ln = pd.read_pickle('Data/Chapter_2/LN_Cat_Dict.pickle')
except FileNotFoundError:
ln_file = tk_control(
"askopenfilename(title='Where is your Louw-Nida dictionary pickle?')")
self.ln = pd.read_pickle(ln_file)
self.scores = {}
self.averages = {}
self.ave_no_93 = {}
self.good_words = []
self.prob_words = []
self.rng = words
self.prob_word_replace = {'περιΐστημι': 'περιΐστημι',
'προΐστημι': 'προΐστημι',
'παρατεινω': 'παρατείνω',
'μήπως': '',
'ταβέρνη': 'Ταβέρνη',
'ἀναμάρτητος': '',
'προσεγγίζω': '',
'ηλι': 'ἠλί',
'δανειστής': 'δανιστής',
'κατακύπτω': '',
'πρωΐ': 'πρωΐ',
'τυρβάζω': '',
'Θυάτιρα': 'Θυάτειρα',
'τετράπουν': 'τετράπους',
'ἰουδαΐζω': 'ἰουδαΐζω',
'Τρωγύλλιον': '',
'σεβαστός': 'Σεβαστός',
'σωτήριον': 'σωτήριος',
'κακοπάθεια': 'κακοπαθία',
'προσανατίθεμαι': 'προσανατίθημι',
'Χερούβ': 'Χεροῦβ',
'πρωΐα': 'πρωΐα',
'Σαλίμ': 'Σαλείμ',
'νόσημα': '',
'διΐστημι': 'διΐστημι',
'Νικολαΐτης': 'Νικολαΐτης',
'αὐτόφωρος': '',
'ὅμιλος': '',
'μαρανα': '',
'Ματθάτ': 'Ματθάν',
'ὀρεινή': 'ὀρεινός',
'δευτερόπρωτος': '',
'ἔξεστι': 'ἔξεστι(ν)',
'τομός': 'τομώτερος',
'πειραω': 'πειράω',
'ῥυπαίνομαι': 'ῥυπαρεύω',
'Λεββαιος': '',
'ελωι': 'ἐλωΐ',
'πραΰς': 'πραΰς',
'ὑπερβαίνω': '',
'ὅποτε': 'ὁπότε',
'ἐνεός': 'ἐνέος',
'τεσσαρακονταετής': 'τεσσερακονταετής',
'ἀγαθοποιΐα': 'ἀγαθοποιΐα',
'Σάπφειρα': 'Σάπφιρα',
'σαβαχθανι': 'σαβαχθάνι',
'στοιχεῖα': 'στοιχεῖον',
'συζήτησις': '',
'ἐκτίθεμαι': 'ἐκτίθημι',
'Κλῆμης': 'Κλήμης',
'τάραχή': 'τάραχος',
'Πάρθοι': 'Πάρθος',
'λεμα': 'λεμά',
'Λωΐς': 'Λωΐς',
'Βηθζαθά': 'Βηθεσδά',
'πίμπραμαι': 'πίμπρημι',
'Σεμεΐν': 'Σεμεΐν',
'Νεάπολις': '',
'Νύμφας': 'Νύμφα',
'πλείων': 'πολύς',
'Πτολεμαΐς': 'Πτολεμαΐς',
'Βηθαβαρα': '',
'Ῥωμαικος': '',
'ἐκλανθάνομαι': 'ἐκλανθάνω',
'Βενιαμείν': 'Βενιαμίν',
'Ἰουνιᾶς': 'Ἰουνία',
'πραΰτης': 'πραΰτης',
'ῥεδή': 'ῥέδη',
'καταβιβαζω': 'καταβαίνω',
'σπόριμα': 'σπόριμος',
'Ἀχαΐα': 'Ἀχαΐα',
'εὐποιΐα': 'εὐποιΐα',
'ἀνάτεμα': 'ἀνάθεμα',
'ὀσφῦς': 'ὀσφύς',
'κηριον': '',
'Ναΐν': 'Ναΐν',
'θα': 'θά',
'δήποτε': '',
'θρῆνος': '',
'Ἠσαΐας': 'Ἠσαΐας',
'καταγράφω': '',
'Καλοι Λιμένης': '',
'πραϋπάθεια': 'πραϋπαθία',
'τεσσαράκοντα': 'τεσσεράκοντα',
'μελισσιος': '',
'Γεργεσηνός': 'Γερασηνός',
'Ἑβραικός': '',
'ἅλς': '',
'Φάρες': 'Φαρές',
'λόγια': 'λογεία',
'ἀΐδιος': 'ἀΐδιος',
'ἀγραύλεω': 'ἀγραυλέω',
'νεομηνία': 'νουμηνία',
'Ἑβραΐς': 'Ἑβραΐς'}
def LoadDF(self):
file = 'Data/350/PPMI_CS_350_SBL_GNT_SVD_exp=1.45.pickle'
try:
self.df = pd.read_pickle(file)
except FileNotFoundError:
file = tk_control(
"askopenfilename(title='Where is your pickle file for window = 350, svd exponent = 1.45')")
self.df = pd.read_pickle(file)
def SimCalc(self, w):
self.scores[w] = {}
mean, std = np.mean(self.df.values), np.std(self.df.values)
print(w)
self.scores[w] = pd.DataFrame(index=list(self.ln.keys()),
columns=['Mean', 'STD +/-'])
for cat in self.ln.keys():
vals = []
cat_words = []
for d in self.ln[cat]['words']:
word = list(d.keys())[0]
if word in self.df.index:
cat_words.append((word, d[word]))
self.good_words.append(word)
else:
new_word = self.prob_word_replace[word]
if new_word != '':
cat_words.append((new_word, d[word]))
self.prob_words.append(word)
for word1 in cat_words:
if word1[0] != w:
vals.append(self.df.ix[word1[0], w])
self.scores[w].ix[cat, 'Mean'] = np.mean(vals)
self.scores[w].ix[cat, 'STD +/-'] = (np.mean(vals) - mean) / std
def WriteFiles(self):
with open('Data/Chapter_2/LN_Word_Cat_Finder_SVD.pickle',
mode='wb') as file:
dump(self.scores, file)
# lems = pd.read_pickle('Data/SBLGNT_lem_dict.pickle')
save_file = 'Data/Chapter_2/LN_Word_Cat_Finder_SVD.csv'
self.WriteLines(save_file)
def WriteLines(self, save_file):
print('Writing Lines')
with open(save_file, mode='w', encoding='utf-8') as file:
file.write('Scores for Window Size 350; SVD Exponent 1.45\n')
file.write('Word,Category,Mean CS with Category,'
'Standard Deviations +/- Average\n')
for word in self.scores.keys():
for cat in sorted(self.scores[word].index):
try:
file.write(
'{0},{1}.{2}-{3} {4},{5},{6}\n'.format
(
word,
cat[0],
cat[1],
cat[2],
self.ln[cat]['gloss'].replace(',', ' '),
self.scores[word].ix[cat, 'Mean'][0],
self.scores[word].ix[cat, 'STD +/-'][0]
)
)
except IndexError:
file.write(
'{0},{1}.{2}-{3} {4},{5},{6}\n'.format
(
word,
cat[0],
cat[1],
cat[2],
self.ln[cat]['gloss'].replace(',', ' '),
self.scores[word].ix[cat, 'Mean'],
self.scores[word].ix[cat, 'STD +/-']
)
)
def CatSimPipe(self):
self.LoadDF()
for w in self.rng:
self.SimCalc(w)
self.WriteFiles()
class SynSimWin(CatSimWin):
def __init__(self, algo, num_syns, rng, syn_file=None, lems=False,
CS_dir=None, dest_dir=None,
corpus=('SBL_GNT_books', None, 1.0)):
'''
This class calculates the context window size that returns the best
average cosine similarity score based on synonym similarity data
:param algo:
:param num_syns:
:param rng:
:param syn_file:
:param lems:
:return:
'''
if syn_file == None:
syn_file = tk_control(
"askopenfilename(title='Where is your synonym DF?')")
try:
self.syn_df = pd.read_hdf(syn_file, 'CS')
except:
self.syn_df = pd.read_pickle(syn_file)
self.averages = {}
self.rng = rng
self.num_syns = num_syns
self.algo = algo
self.lems = lems
self.CS_dir = CS_dir
self.dest_dir = dest_dir
self.corpus = corpus
def LoadDF(self, w):
# need to implement support for the new .dat (memmap) files I am creating
file = '{3}/{0}/CS_{1}_{0}_lems={2}_{4}_min_occ={5}_SVD_exp={6}.hd5'.format(
str(w), self.algo, self.lems, self.CS_dir, self.corpus[0],
self.corpus[1], self.corpus[2])
try:
self.df = pd.read_hdf(file, 'df')
except FileNotFoundError:
file = tk_control(
"askopenfilename(title='Where is your pickle file for window = {0}, svd exponent = {1}'.format(str(w), 'None'))")
self.df = pd.read_pickle(file)
except OSError:
file = '{3}/{0}/{1}_CS_{0}_lems={2}_{4}_min_occ={5}_SVD_exp={6}.dat'.format(
str(w), self.algo, self.lems, self.CS_dir, self.corpus[0],
self.corpus[1], self.corpus[2])
self.ind = pd.read_pickle(
'{0}/{2}/{1}_IndexList_w={2}_lems={3}_min_occs={4}.pickle'.format(
self.CS_dir, self.corpus[0], str(w), self.lems,
self.corpus[1]))
self.df = np.memmap(file, dtype='float', mode='r',
shape=(len(self.ind), len(self.ind)))
def SimCalc(self, w):
if self.ind:
mean, std = np.mean(self.df), np.std(self.df)
else:
mean, std = np.mean(self.df.values), np.std(self.df.values)
print('%s average: %s, std: %s' % (w, mean, std))
vals = []
for word in self.top_syns.keys():
# top_syns = list(self.syn_df[word].order(ascending=False)[1:self.num_syns+1].index)
for word2 in self.top_syns[word]:
try:
if self.ind:
#this means we are using a memmap and not a DataFrame
vals.append(self.df[self.ind.index(word)][
self.ind.index(word2)])
else:
vals.append(self.df.ix[word, word2])
except KeyError:
continue
except ValueError:
continue
syn_mean = np.mean(vals)
syn_std = (np.mean(vals) - mean) / std
self.averages[w] = (syn_mean, syn_std)
def WriteFiles(self):
with open(
'{4}/{6}_Syn_Window_Averages_{0}_num_syns={1}_lems={3}_rng={2}_min_occs={5}.csv'.format(
self.algo,
self.num_syns,
self.rng,
self.lems,
self.dest_dir,
self.corpus[1],
self.corpus[0]),
mode='w',
encoding='utf-8') as file:
file.write(
'Average Number of Standard Deviations above or below Average '
'per window\n')
file.write('Window Size,Average,+/- Standard Deviations\n')
for w_size in sorted(self.averages.keys()):
file.write(
'{0},{1},{2}\n'.format(w_size, self.averages[w_size][0],
self.averages[w_size][1]))
def CatSimPipe(self):
# calculate the syn list once to speed up later processing
if type(self.syn_df) == dict:
self.top_syns = self.syn_df
else:
self.top_syns = {}
for word in self.syn_df.index:
self.top_syns[word] = list(
self.syn_df[word].order(ascending=False)[
1:self.num_syns + 1].index)
del self.syn_df
for w in self.rng:
self.LoadDF(w)
self.SimCalc(w)
self.WriteFiles()
print('Finished')
class SynSimSVD(SynSimWin):
def __init__(self, algo, num_syns, rng, win, CS_dir=None, syn_file=None,
lems=False):
'''
This class calculates the SVD exponent that returns the best
average cosine similarity score based on synonym similarity data
:param algo:
:param num_syns:
:param rng:
:param win:
:param syn_file:
:param lems:
:return:
'''
if syn_file == None:
syn_file = tk_control(
"askopenfilename(title='Where is your synonym DF?')")
self.syn_df = pd.read_hdf(syn_file, 'CS')
self.averages = {}
self.rng = rng
self.num_syns = num_syns
self.algo = algo
self.lems = lems
self.win = win
self.CS_dir = CS_dir
def LoadDF(self, w):
file = '{4}/{0}/CS_{1}_{0}_SBL_GNT_books_lems={2}_min_occ=None_SVD_exp={3}.hd5'.format(
self.win, self.algo, self.lems, str(w), self.CS_dir)
try:
self.df = pd.read_hdf(file, 'df')
except FileNotFoundError:
file = tk_control(
"askopenfilename(title='Where is your pickle file for window = {0}, svd exponent = {1}'.format(str(w), 'None'))")
self.df = pd.read_pickle(file)