#! /usr/bin/env python3
__author__ = 'matt'
import sys
import os
# Make the parent directory of this script importable so that the
# ``Data_Production`` package imported below can be found.  This has to run
# before those imports, which is why the import block is split in two.
PACKAGE_PARENT = '..'
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))
import re
from collections import Counter
import datetime
from math import log, ceil
import pandas as pd
import numpy as np
try:
    from Data_Production.TK_files import tk_control
except ImportError:
    # Keep the name bound so that later code can test for the missing
    # dialog helper instead of failing with a NameError.
    tk_control = None
    print('Tkinter cannot be used on this Python installation.\nPlease designate a list of files in the files variable.')
from sklearn.metrics.pairwise import pairwise_distances
from glob import glob
# from celery import group
# from proj.tasks import counter, svd_calc
from itertools import combinations_with_replacement
from pickle import dump
from multiprocessing import Pool
from Data_Production.multi_tasks import counter
import argparse
import shutil
class SemPipeline:
    """Produce co-occurrence, significance, and similarity matrices for a corpus.

    The pipeline runs in three stages.  Each stage writes a square
    ``numpy.memmap`` file into ``self.dest`` and re-uses that file when it
    already exists: co-occurrence counts (:meth:`cooc_counter`), statistical
    significance (:meth:`PPMI` and/or :meth:`LL`) and pairwise similarity
    (:meth:`CS`).  :meth:`runPipeline` runs all three in order.

    :param win_size: context window size
    :type win_size: int
    :param lemmata: whether to use word lemmata
    :type lemmata: bool
    :param weighted: whether to use a weighted window type
    :type weighted: bool
    :param algo: the significance algorithm to use: 'PPMI', 'LL' or 'both'.
        Any other value is replaced by 'both'.
    :type algo: str
    :param sim_algo: the metric handed to ``pairwise_distances``: 'cityblock',
        'cosine', 'euclidean', 'l1', 'l2' or 'manhattan'.  Any other value is
        replaced by 'cosine'.
    :type sim_algo: str
    :param files: the directory in which the individual .txt files are held
    :type files: str
    :param c: the number of worker processes to use in self.word_counter
        (will be removed in the future)
    :type c: int
    :param occ_dict: the path and filename for the occurrence dictionary
        pickle (the string 'None' is treated as None)
    :type occ_dict: str
    :param min_count: the minimum occurrence count below which words will not
        be counted (the string 'None' is treated as None)
    :type min_count: int
    :param jobs: number of jobs to use during the similarity calculations
    :type jobs: int
    :param stops: whether to include stop words or not (True means to
        include them)
    :type stops: bool
    :param kwargs: accepted and ignored, so that a full argparse namespace
        can be passed in

    :ivar w: the context window size
    :type w: int
    :ivar lems: whether a lemmatized or unlemmatized text will be used
    :type lems: bool
    :ivar weighted: whether a weighted or unweighted context window will be
        used (True == weighted)
    :type weighted: bool
    :ivar algo: which significance algorithm will be used ('PPMI', 'LL' or
        'both')
    :type algo: str
    :ivar sim_algo: the similarity metric to be used
    :type sim_algo: str
    :ivar dir: the directory path in which the texts are located
    :type dir: str
    :ivar c: the number of worker processes used during co-occurrence counting
    :type c: int
    :ivar occ_dict: the location of the dictionary holding the occurrence
        count of every word
    :type occ_dict: str
    :ivar min_count: the minimum threshold of occurrences for the words to be
        calculated
    :type min_count: int
    :ivar jobs: the value to be used for n_jobs in the similarity calculations
    :type jobs: int
    :ivar stops: the stop words dropped during word extraction; empty when
        stop words are kept
    :type stops: (str)
    :ivar ind: the indices for the rows and columns of the matrix (i.e., the
        words) - filled in self.cooc_counter
    :type ind: [str]
    :ivar cols: the length of self.ind - filled in self.cooc_counter
    :type cols: int
    :ivar coll_df: empty tuple until self.cooc_counter replaces it with a
        numpy.memmap
    :type coll_df: numpy.memmap
    :ivar LL_df: empty tuple until self.LL replaces it with a numpy.memmap
    :type LL_df: numpy.memmap
    :ivar PPMI_df: empty tuple until self.PPMI replaces it with a numpy.memmap
    :type PPMI_df: numpy.memmap
    :ivar CS_df: empty tuple until self.CS replaces it with a numpy.memmap
    :type CS_df: numpy.memmap
    :ivar stat_df: set to either self.PPMI_df or self.LL_df in self.CS
    :type stat_df: numpy.memmap
    :ivar stat_file: file name of the matrix behind self.stat_df - filled in
        self.CS
    :type stat_file: str
    :ivar dest: the destination directory for all files - filled in
        self.make_dest
    :type dest: str
    :ivar corpus: the name of the corpus under investigation - filled in
        self.make_dest
    :type corpus: str
    """

    # Metrics accepted for ``sim_algo``.
    SIM_ALGOS = ('cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan')
    # Values accepted for ``algo``.
    STAT_ALGOS = ('PPMI', 'LL', 'both')
    # Number of rows written between two flushes of a matrix being filled.
    FLUSH_EVERY = 5000
    # Stop words that are dropped when the pipeline is created with
    # ``stops=False``.
    GREEK_STOPS = ('μή', 'ἑαυτοῦ', 'ἄν', 'ἀλλ’', 'ἀλλά', 'ἄλλος', 'ἀπό',
                   'ἄρα', 'αὐτός', 'δ’', 'δέ', 'δή', 'διά', 'δαί',
                   'δαίς', 'ἔτι', 'ἐγώ', 'ἐκ', 'ἐμός', 'ἐν', 'ἐπί',
                   'εἰ', 'εἰμί', 'εἴμι', 'εἰς', 'γάρ', 'γε', 'γα^', 'ἡ',
                   'ἤ', 'καί', 'κατά', 'μέν', 'μετά', 'μή', 'ὁ', 'ὅδε',
                   'ὅς', 'ὅστις', 'ὅτι', 'οὕτως', 'οὗτος', 'οὔτε',
                   'οὖν',
                   'οὐδείς', 'οἱ', 'οὐ', 'οὐδέ', 'οὐκ', 'περί', 'πρός',
                   'σύ', 'σύν', 'τά', 'τε', 'τήν', 'τῆς', 'τῇ', 'τι',
                   'τί', 'τις', 'τίς', 'τό', 'τοί', 'τοιοῦτος', 'τόν',
                   'τούς', 'τοῦ', 'τῶν', 'τῷ', 'ὑμός', 'ὑπέρ', 'ὑπό',
                   'ὡς', 'ὦ', 'ὥστε', 'ἐάν', 'παρά', 'σός')

    def __init__(self, win_size=10, lemmata=True, weighted=True, algo='PPMI',
                 sim_algo='cosine', files=None, c=8, occ_dict=None,
                 min_count=1, jobs=1, stops=True, **kwargs):
        """Store the run parameters; see the class docstring for their meaning."""
        self.w = win_size
        self.lems = lemmata
        self.weighted = weighted
        if sim_algo in self.SIM_ALGOS:
            self.sim_algo = sim_algo
        else:
            print(
                "The only accepted values for 'sim_algo' are 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', or 'manhattan'")
            print("Setting 'sim_algo' to 'cosine'")
            self.sim_algo = 'cosine'
        if algo in self.STAT_ALGOS:
            self.algo = algo
        else:
            print(
                'The only accepted values for "algo" are "PPMI", "LL", or "both".')
            print("Setting 'algo' to 'both'")
            self.algo = 'both'
        self.dir = files
        self.c = c
        # The string 'None' (e.g. typed on a command line) means "not given".
        self.occ_dict = None if occ_dict == 'None' else occ_dict
        self.min_count = None if min_count == 'None' else min_count
        self.jobs = jobs
        # ``stops=True`` keeps the stop words, so nothing is filtered out.
        self.stops = () if stops else self.GREEK_STOPS
        # the following ivars are filled later in the class
        self.ind = []
        self.cols = 0
        self.coll_df = ()
        self.LL_df = ()
        self.PPMI_df = ()
        self.CS_df = ()
        self.stat_df = ()
        self.stat_file = ''
        self.dest = ''
        self.corpus = ''

    @staticmethod
    def _timestamp():
        """Return the current wall-clock time as an ISO formatted string."""
        return datetime.datetime.now().time().isoformat()

    def _open_matrix(self, path, mode='r'):
        """Map *path* as a ``self.cols`` x ``self.cols`` float matrix."""
        return np.memmap(path, dtype='float', mode=mode,
                         shape=(self.cols, self.cols))

    def _index_file(self):
        """Return the pickle file name that stores ``self.ind``."""
        # Only the extension is swapped; a '.dat' elsewhere in the path must
        # not be touched.
        return os.path.splitext(self.produce_file_names('Index'))[0] + '.pickle'

    def _checkpoint(self, attr, path, label, row):
        """Report progress for *label* and flush the matrix held in *attr*.

        Dropping the only reference to the memmap writes its pending changes
        to disk; the file is then mapped again for further writing.
        """
        print('{0} {1}% done'.format(label, row / self.cols * 100), flush=True)
        setattr(self, attr, None)
        setattr(self, attr, self._open_matrix(path, 'r+'))

    def file_chooser(self):
        """Ask for the corpus directory with the tkinter dialog in tk_control.

        Used to fill self.dir if files=None.

        :raises RuntimeError: if tk_control could not be imported
        """
        chooser = globals().get('tk_control')
        if chooser is None:
            raise RuntimeError('Tkinter cannot be used on this Python installation. '
                               'Please pass the corpus directory in "files".')
        self.dir = chooser("askdirectory(title='In which directory are the XML file(s) would you like to analyze?')")

    def produce_file_names(self, step):
        """Build the file name that encodes the current parameters.

        :param step: The step of the process, used to generate the correct file name
        :type step: str
        :return: The file name
        :rtype: str
        """
        return '{dest}/{step}_{win}_lems={lems}_{corpus}_min_occ={min_count}_no_stops={stops}_weighted={weighted}.dat'.format(
            dest=self.dest, step=step, win=str(self.w), lems=self.lems, corpus=self.corpus, min_count=self.min_count,
            stops=bool(self.stops), weighted=self.weighted)

    def word_extract(self, text, pattern, stops=()):
        """Extract the words from the lines of a text.

        Every line is replaced by the first group of *pattern*; a line that
        does not match the pattern is kept unchanged.  Empty results and
        stop words are dropped.

        :param text: the lines of the text
        :type text: list
        :param pattern: compiled regular expression with one group
        :param stops: the words to leave out
        :type stops: (str)
        :return: list of words
        :rtype: list
        """
        extracted = (re.sub(pattern, r'\1', line) for line in text)
        return [word for word in extracted if word != '' and word not in stops]

    def word_counter(self, words, counts, min_lems=None):
        """Add the co-occurrence counts of one text to *counts*.

        The word list is cut into self.c slices which are handed to the
        ``counter`` task in a process pool.

        :param words: the words of one text, in order
        :type words: list
        :param counts: the counts collected so far; updated in place
        :type counts: Counter
        :param min_lems: words that must not get a row of their own
        :type min_lems: set
        :return: the co-occurrence counts for the files
        :rtype: Counter
        """
        if min_lems is None:
            min_lems = frozenset()
        if not words:
            # Nothing to count; do not start a process pool for an empty text.
            return counts
        step = ceil(len(words) / self.c)
        steps = [(step * i, min(step * (i + 1), len(words)))
                 for i in range(self.c)]
        with Pool(processes=self.c) as pool:
            results = pool.starmap(
                counter,
                [(self.weighted, self.w, words, limits) for limits in steps])
        # since the counter task returns Counter objects, the update method
        # below adds instead of replacing the values
        for result in results:
            for key, neighbours in result.items():
                if key in min_lems:
                    continue
                if key in counts:
                    counts[key].update(neighbours)
                else:
                    counts[key] = neighbours
        return counts

    def cooc_counter(self):
        """Count the number of times each word co-occurs with each other word.

        Existing co-occurrence and index files for the current parameters are
        loaded instead of being recomputed.

        :ivar ind: the words that represent the ordered indices of all
            matrices produced in later calculation
        :type ind: [str]
        :ivar coll_df: the co-occurrence matrix, opened read-only
        :type coll_df: numpy.memmap
        :raises ValueError: if no words are found in the corpus
        """
        cooc_dest = self.produce_file_names('COOC')
        index_dest = self._index_file()
        # The index is written last, so both files exist only after a
        # complete earlier run.
        if os.path.isfile(cooc_dest) and os.path.isfile(index_dest):
            self.ind = pd.read_pickle(index_dest)
            self.cols = len(self.ind)
            self.coll_df = self._open_matrix(cooc_dest)
            return
        if self.lems:
            pattern = re.compile(r'.+?lem="([^"]*).*')
        else:
            pattern = re.compile(r'.+?>([^<]*).*')
        min_lems = set()
        if self.occ_dict and self.min_count is not None:
            # NOTE: unpickling runs arbitrary code; only use trusted files.
            occs = pd.read_pickle(self.occ_dict)
            min_lems = {w for w in occs if occs[w] < self.min_count}
            del occs
        # Compute co-occurrence counts for each file in self.dir.  The files
        # are sorted so that the order of self.ind is reproducible.
        counts = Counter()
        for file in sorted(glob('{0}/*.txt'.format(self.dir))):
            # assumes the corpus files are UTF-8, like the stop list here
            with open(file, encoding='utf-8') as f:
                words = self.word_extract(f.read().lower().split('\n'),
                                          pattern, self.stops)
            counts = self.word_counter(words, counts, min_lems)
        # Fill the ivars that come from the co-occurrence counts
        self.ind = list(counts.keys())
        self.cols = len(self.ind)
        if not self.cols:
            raise ValueError(
                'No words were found in the .txt files in {0}'.format(self.dir))
        # Write the counts dictionary to a numpy.memmap file
        print('Now writing cooccurrence file at {0}'.format(self._timestamp()))
        self.coll_df = self._open_matrix(cooc_dest, 'w+')
        for i, w in enumerate(self.ind):
            # Words without a column of their own (see min_lems) are dropped
            # by the re-indexing, missing pairs become 0.
            row = pd.Series(counts[w], index=self.ind,
                            dtype=np.float64).fillna(0)
            self.coll_df[i] = row.values
            if i % self.FLUSH_EVERY == 0:
                self._checkpoint('coll_df', cooc_dest, 'COOC', i)
        # Flush and re-open self.coll_df as read-only
        self.coll_df = None
        self.coll_df = self._open_matrix(cooc_dest)
        # Save the index list
        with open(index_dest, mode='wb') as f:
            dump(self.ind, f)

    def log_L(self, k, n, x):
        """Return log(L(k,n,x)) with L(k,n,x) = (x**k)*(1-x)**(n-k).

        The product is formed before the logarithm is taken, so it can
        underflow for large k and n; see log_space_L.

        :param k:
        :type k: pandas.Series
        :param n:
        :type n: numpy.ndarray
        :param x:
        :type x: numpy.ndarray
        :return: Log-likelihood values
        :rtype: numpy.ndarray
        """
        return np.log(np.power(np.float64(x), k)
                      * np.power(np.float64(1 - x), n - k))

    def log_space_L(self, k, n, x):
        """Return log(L(k,n,x)) with L(k,n,x) = (x**k)*(1-x)**(n-k).

        Moves the calculations to log-space to deal with floats that are too
        small for float64.

        :param k:
        :type k: numpy.ndarray
        :param n:
        :type n: numpy.ndarray
        :param x:
        :type x: numpy.ndarray
        :return: Log-likelihood values
        :rtype: numpy.ndarray
        """
        return np.log(x) * (k) + (np.log(1 - x) * (n - k))

    def log_like(self, row, C2, P, N):
        """Calculate the Log-likelihood values for a single row.

        :param row: the index of the row in the table to be calculated
        :type row: int
        :param C2: number of co-occurrences for each column of the table
        :type C2: numpy.ndarray
        :param P: ratio of co-occurrences per column to total co-occurrences
            in the table
        :type P: numpy.ndarray
        :param N: total number of co-occurrences in the table
        :type N: float
        :return: Log-likelihood values for a single row in the table; the
            result is not cleaned and may contain non-finite values
        :rtype: numpy.ndarray
        """
        C12 = self.coll_df[row]
        C1 = np.sum(C12)
        # Divisions by zero and log(0) are expected here; the resulting
        # infinities are repaired below, so the warnings are only noise.
        with np.errstate(all='ignore'):
            # P1 is ratio of single co-occurrence values to the total
            # co-occurrences for that row
            P1 = C12 / C1
            # P2 ratio of total co-occurrences for a word minus the
            # co-occurrences with the word in question to the total number
            # of co-occurrences in the table minus the total co-occurrences
            # for the row.
            P2 = (C2 - C12) / (N - C1)
            LL1 = self.log_space_L(C12, C1, P)
            LL2 = self.log_space_L(C2 - C12, N - C1, P)
            LL3 = self.log_L(C12, C1, P1)
            # Recompute the inf and -inf values of LL3 in log space.
            for ind in np.where(abs(LL3) == np.inf)[0]:
                try:
                    LL3[ind] = (log(P1[ind]) * C12[ind]) + (
                        log(1 - P1[ind]) * (C1 - C12[ind]))
                except ValueError:
                    # math.log rejects 0 and negative arguments
                    LL3[ind] = 0
            LL4 = self.log_space_L(C2 - C12, N - C1, P2)
            # Recompute the inf and -inf values of LL4 outside of log space.
            for ind in np.where(abs(LL4) == np.inf)[0]:
                try:
                    LL4[ind] = self.log_L((C2[ind] - C12[ind]), (N - C1),
                                          P2[ind])
                except ValueError:
                    LL4[ind] = 0
            return -2 * (LL1 + LL2 - LL3 - LL4)

    def LL(self):
        """Calculate the Log-likelihood values for the whole matrix.

        An existing file for the current parameters is loaded instead.

        :ivar LL_df: matrix of log-likelihood values, opened read-only
        :type LL_df: numpy.memmap
        """
        dest_file = self.produce_file_names('LL')
        if os.path.isfile(dest_file):
            self.LL_df = self._open_matrix(dest_file)
            return
        n = np.sum(self.coll_df)
        c2 = np.sum(self.coll_df, axis=0)
        p = c2 / n
        self.LL_df = self._open_matrix(dest_file, 'w+')
        for i in range(self.cols):
            self.LL_df[i] = self.log_like(i, c2, p, n)
            if i % self.FLUSH_EVERY == 0:
                self._checkpoint('LL_df', dest_file, 'LL', i)
        # Flush and reload self.LL_df as read-only
        self.LL_df = None
        self.LL_df = self._open_matrix(dest_file)

    def PMI_calc(self, row, P2, N):
        """Calculate the PPMI values for one table row.

        :param row: index for the word's row in the table
        :type row: int
        :param P2: ratio of co-occurrences per column to total co-occurrences
            in the table
        :type P2: Numpy ndarray
        :param N: total co-occurrences in the table
        :type N: float
        :return: PPMI values for a row in the table; non-finite and negative
            values are set to 0
        :rtype: Numpy ndarray
        """
        C12 = self.coll_df[row]
        # C1 is the total co-occurrences in the row
        C1 = np.sum(C12)
        # P1 is the probability that the word co-occurs
        P1 = C1 / N
        # P12 is a vector of the probabilities that the word occurs with any
        # other word
        P12 = C12 / N
        # log2(0) and 0/0 are expected; they are zeroed right below.
        with np.errstate(divide='ignore', invalid='ignore'):
            a = np.log2(np.divide(P12, P1 * P2))
        a[~np.isfinite(a)] = 0
        a[a < 0] = 0
        return a

    def PPMI(self):
        """Calculate the PPMI values for the whole table.

        An existing file for the current parameters is loaded instead.

        :ivar PPMI_df: matrix of PPMI values, opened read-only
        :type PPMI_df: numpy.memmap
        """
        dest_file = self.produce_file_names('PPMI')
        if os.path.isfile(dest_file):
            self.PPMI_df = self._open_matrix(dest_file)
            return
        n = np.sum(self.coll_df)
        p2 = np.sum(self.coll_df, axis=0) / n
        self.PPMI_df = self._open_matrix(dest_file, 'w+')
        for i in range(self.cols):
            self.PPMI_df[i] = self.PMI_calc(i, p2, n)
            if i % self.FLUSH_EVERY == 0:
                self._checkpoint('PPMI_df', dest_file, 'PPMI', i)
        # Flush and reload self.PPMI_df as read-only
        self.PPMI_df = None
        self.PPMI_df = self._open_matrix(dest_file)

    def CS(self, algorithm):
        """Calculate the similarity of every matrix row with every other row.

        For self.sim_algo == 'cosine' the stored value is 1 minus the cosine
        distance; for every other metric the distance itself is stored.  An
        existing file for the current parameters is loaded instead.

        :param algorithm: which significance matrix ('PPMI' or 'LL') to use
        :type algorithm: str
        :ivar CS_df: matrix of similarity values, opened read-only
        :type CS_df: numpy.memmap
        :raises ValueError: if algorithm is neither 'PPMI' nor 'LL'
        """
        print('Starting {} calculations for {} for '
              'w={}, lem={}, weighted={} at {}'.format(self.sim_algo,
                                                       self.corpus,
                                                       str(self.w),
                                                       self.lems,
                                                       self.weighted,
                                                       self._timestamp()))
        if algorithm not in ('PPMI', 'LL'):
            raise ValueError(
                "'algorithm' must be 'PPMI' or 'LL', not {!r}".format(algorithm))
        dest_file = self.produce_file_names('CS_{}'.format(algorithm))
        if os.path.isfile(dest_file):
            # Load the finished matrix so that callers find it in self.CS_df.
            self.CS_df = self._open_matrix(dest_file)
            return
        if algorithm == 'PPMI':
            self.stat_df = self.PPMI_df
        else:
            self.stat_df = self.LL_df
        self.stat_file = self.produce_file_names(algorithm)
        self.CS_df = self._open_matrix(dest_file, 'w+')
        distances = pairwise_distances(self.stat_df, metric=self.sim_algo,
                                       n_jobs=self.jobs)
        if self.sim_algo == 'cosine':
            self.CS_df[:] = 1 - distances
        else:
            self.CS_df[:] = distances
        # Flush and reload self.CS_df as read-only
        self.CS_df = None
        self.CS_df = self._open_matrix(dest_file)
        print('Finished with {} calculations for {} for '
              'w={}, lem={}, weighted={} at {}'.format(self.sim_algo,
                                                       self.corpus,
                                                       str(self.w),
                                                       self.lems,
                                                       self.weighted,
                                                       self._timestamp()))

    def cs_loop(self, dest_file):
        """Fill self.CS_df with cosine similarities chunk by chunk.

        Divides self.stat_df into chunks more easily handled in memory (the
        number of rows used at a time is determined in the step variable) and
        then loops through all chunk combinations.  Expects self.stat_df to be
        set and self.CS_df to be open for writing.

        :param dest_file: the file name to which to save the CS data
        :type dest_file: str
        """
        step = 5000
        ind = self.cols
        # Consecutive (start, stop) row ranges; the last one may be shorter.
        steps = [(start, min(start + step, ind))
                 for start in range(0, ind, step)]
        if not steps:
            return
        last_ind = steps[0]
        for i1, i2 in combinations_with_replacement(steps, 2):
            part1 = self.stat_df[i1[0]:i1[1]]
            part2 = self.stat_df[i2[0]:i2[1]]
            self.CS_df[i1[0]:i1[1], i2[0]:i2[1]] = 1 - pairwise_distances(
                part1, part2, metric='cosine')
            # The matrix is symmetric, so the mirrored block is a transpose.
            self.CS_df[i2[0]:i2[1], i1[0]:i1[1]] = self.CS_df[i1[0]:i1[1], i2[0]:i2[1]].T
            if last_ind != i1:
                # A new block of rows starts: report progress and flush.
                print('CS {0}% done'.format((i1[0] / ind) * 100), flush=True)
                self.CS_df = None
                self.CS_df = self._open_matrix(dest_file, 'r+')
                last_ind = i1

    def stat_eval(self):
        """Run the significance calculations selected by self.algo.

        The co-occurrence matrix is released afterwards.
        """
        print('Starting %s calculations for %s for '
              'w=%s, lem=%s, weighted=%s at %s' %
              (self.algo,
               self.corpus,
               str(self.w),
               self.lems,
               self.weighted,
               self._timestamp()))
        if self.algo == 'both':
            print('Starting PPMI at {0}'.format(self._timestamp()))
            self.PPMI()
            print('Starting LL at {0}'.format(self._timestamp()))
            self.LL()
        elif self.algo == 'PPMI':
            self.PPMI()
        elif self.algo == 'LL':
            self.LL()
        # Release the mapping but keep the attribute defined.
        self.coll_df = ()
        print('Finished with %s calculations for %s for '
              'w=%s, lem=%s, weighted=%s at %s' %
              (self.algo,
               self.corpus,
               str(self.w),
               self.lems,
               self.weighted,
               self._timestamp()))

    def make_dest(self):
        """Construct the destination directory name and create the directory.

        :ivar dest: the directory path into which the results will be saved
        :type dest: str
        :ivar corpus: the name of the corpus being analyzed
        :type corpus: str
        """
        self.dest = os.path.join(self.dir, str(self.w))
        os.makedirs(self.dest, exist_ok=True)
        # normpath drops a trailing separator, which would otherwise leave
        # the corpus name empty.
        self.corpus = os.path.basename(os.path.normpath(self.dir))

    def runPipeline(self):
        """Run the whole pipeline using the params given in self.__init__."""
        if not self.dir:
            self.file_chooser()
        self.make_dest()
        print('Started analyzing %s at %s' %
              (self.corpus,
               self._timestamp()))
        self.cooc_counter()
        self.stat_eval()
        if self.algo == 'both':
            self.CS('PPMI')
            self.CS('LL')
        elif self.algo == 'PPMI':
            self.CS('PPMI')
        elif self.algo == 'LL':
            self.CS('LL')
        print('Finished at %s' % (self._timestamp()))
class ParamTester(SemPipeline):
    """Run parameter testing for the corpus in question.

    Every combination of window size (min_w to max_w in steps of step),
    window type (w_tests) and text type (l_tests) is processed in
    self.RunTests.  The similarity matrices are scored with ``CatSimWin`` and
    the scores are collected in self.param_dict.

    :param min_w: the minimum context window size to use
    :type min_w: int
    :param max_w: the maximum context window size to use
    :type max_w: int
    :param step: the size of the steps between min_w and max_w
    :type step: int
    :param c: the number of cores to use in the co-occurrence calculations
    :type c: int
    :param jobs: the number of cores to use in the cosine similarity
        calculations
    :type jobs: int
    :param min_count: the minimum occurrence count. Words below this count
        will not be counted. The purpose here is for memory management. My
        tests have shown that using all words produces better results.
    :type min_count: int
    :param files: the directory path for the .txt files that make up the
        corpus
    :type files: str
    :param stops: the stop words to be ignored in the calculations.  A bool
        is read like the ``stops`` flag of SemPipeline (True keeps the stop
        words).
    :type stops: (str)
    :param lem_file: the path and filename for the word occurrence dictionary
        pickle handed to ``CatSimWin`` (the string 'None' is treated as None)
    :type lem_file: str
    :param w_tests: whether to use weighted ("True") or unweighted ("False")
        window types or "both"
    :type w_tests: str
    :param l_tests: whether to use word lemmas ("True") or inflected forms
        ("False") or "both"
    :type l_tests: str
    :param steps: the steps in the calculation process to perform. Allowed:
        'all', 'coocs', 'LL', 'PPMI', 'LL_CS' (cosine similarity based on an
        existing Log-likelihood matrix), 'PPMI_CS' and 'remove' (delete the
        destination directory afterwards).  A single name may be given as a
        plain string.
    :type steps: list
    :param kwargs: accepted and ignored, so that a full argparse namespace
        can be passed in

    :ivar c: the number of cores to use in the co-occurrence calculations
    :type c: int
    :ivar stops: stop words to ignore during the calculations
    :type stops: (str)
    :ivar min_count: the minimum number of occurrences for a word to be used
        in the calculations
    :type min_count: int
    :ivar dir: the directory path for the .txt files that make up the corpus
    :type dir: str
    :ivar sim_algo: the similarity algorithm to use in the calculations
    :type sim_algo: str
    :ivar w_tests: the window types to test
    :type w_tests: [bool]
    :ivar l_tests: the text types to test
    :type l_tests: [bool]
    :ivar stat_algos: 'Both', 'LL' or 'PPMI', depending on the similarity
        steps requested; None if no similarity step was requested
    :type stat_algos: str
    :ivar ind: the indices for the rows and columns of the matrix (i.e., the
        words) - filled in self.RunTests
    :type ind: [str]
    :ivar cols: the length of self.ind - filled in self.RunTests
    :type cols: int
    :ivar coll_df: empty tuple unless a co-occurrence matrix is loaded
    :type coll_df: numpy.memmap
    :ivar LL_df: empty tuple unless a Log-likelihood matrix is loaded
    :type LL_df: numpy.memmap
    :ivar PPMI_df: empty tuple unless a PPMI matrix is loaded
    :type PPMI_df: numpy.memmap
    :ivar CS_df: empty tuple unless a similarity matrix is loaded
    :type CS_df: numpy.memmap
    :ivar stat_df: set to either self.PPMI_df or self.LL_df in self.CS
    :type stat_df: numpy.memmap
    :ivar param_dict: filled with the scores for each set of parameters in
        self.RunTests
    :type param_dict: dict
    """

    def __init__(self, min_w, max_w, step, c=8, jobs=1, min_count=1, files=None, stops=tuple(), lem_file=None,
                 w_tests='both', l_tests='both', steps=('all',), **kwargs):
        """Store the test parameters; see the class docstring for their meaning."""
        # The command line delivers ``stops`` as a flag; let the base class
        # turn a flag into the matching tuple of stop words.
        stops_flag = stops if isinstance(stops, bool) else True
        super().__init__(win_size=min_w, sim_algo='cosine', files=files, c=c,
                         occ_dict=None, min_count=min_count, jobs=jobs,
                         stops=stops_flag)
        if not isinstance(stops, bool):
            self.stops = stops
        self.min_w = min_w
        self.max_w = max_w
        self.step = step
        self.w_tests = self._parse_tests(w_tests, 'w_tests')
        self.l_tests = self._parse_tests(l_tests, 'l_tests')
        self.lem_file = None if lem_file == 'None' else lem_file
        # A bare string would be searched by substring ('LL' in 'LL_CS').
        if isinstance(steps, str):
            steps = (steps,)
        run_all = 'all' in steps
        self.do_all = run_all
        self.do_coocs = run_all or 'coocs' in steps
        self.do_LL = run_all or 'LL' in steps
        self.do_PPMI = run_all or 'PPMI' in steps
        self.do_LL_CS = run_all or 'LL_CS' in steps
        self.do_PPMI_CS = run_all or 'PPMI_CS' in steps
        self.remove = run_all or 'remove' in steps
        if self.do_LL_CS and self.do_PPMI_CS:
            self.stat_algos = 'Both'
        elif self.do_LL_CS:
            self.stat_algos = 'LL'
        elif self.do_PPMI_CS:
            self.stat_algos = 'PPMI'
        else:
            self.stat_algos = None
        # the following ivars are filled later in the class
        self.ind = []
        self.cols = 0
        self.coll_df = ()
        self.LL_df = ()
        self.PPMI_df = ()
        self.CS_df = ()
        self.stat_df = ()
        self.param_dict = {}

    @staticmethod
    def _parse_tests(value, name):
        """Turn a w_tests/l_tests argument into the values to iterate over."""
        if value is None or value == 'both':
            return (True, False)
        if isinstance(value, bool):
            return [value]
        if isinstance(value, str):
            if value == 'True':
                return [True]
            if value == 'False':
                return [False]
            raise ValueError("'{}' must be 'True', 'False' or 'both', not {!r}".format(name, value))
        return value

    def _load_matrix(self, step):
        """Map the matrix file of *step* for the current parameters read-only."""
        return np.memmap(self.produce_file_names(step), dtype='float',
                         mode='r', shape=(self.cols, self.cols))

    def _load_coocs(self):
        """Load the index and the co-occurrence matrix of the current parameters."""
        index_file = os.path.splitext(self.produce_file_names('Index'))[0] + '.pickle'
        self.ind = pd.read_pickle(index_file)
        self.cols = len(self.ind)
        self.coll_df = self._load_matrix('COOC')

    def _score(self, algorithm, cat_sim):
        """Build the similarity matrix for *algorithm* and store its score."""
        pipe = cat_sim(algorithm, [self.w],
                       lems=self.lems,
                       CS_dir=self.dir,
                       dest_dir='{}/Win_size_tests/LN'.format(self.dir),
                       sim_algo='cosine',
                       corpus=(self.dir.split('/')[-1], 1, 1.0, self.weighted),
                       lem_file=self.lem_file)
        self.CS(algorithm)
        # Read the matrix back from its file: this also covers the case in
        # which the file already existed before self.CS was called.
        pipe.df = self._load_matrix('CS_{}'.format(algorithm))
        # Release the matrices that are no longer needed, keeping the
        # attributes defined for the next parameter combination.
        self.CS_df = ()
        self.stat_df = ()
        if algorithm == 'LL':
            self.LL_df = ()
        else:
            self.PPMI_df = ()
        pipe.ind = self.ind
        pipe.SimCalc(self.w)
        pipe.AveCalc(self.w)
        pipe.WriteFiles()
        key = '{}_window={}_lems={}_weighted={}'.format(algorithm, self.w, self.lems, self.weighted)
        self.param_dict[key] = pipe.ave_no_93[self.w]

    def _run_combination(self, cat_sim):
        """Run the requested steps for the current w, weighted and lems."""
        self.make_dest()
        if self.do_coocs:
            self.cooc_counter()
        else:
            # The index differs between parameter combinations, so it is
            # loaded anew every time.
            self._load_coocs()
        if self.do_LL:
            self.LL()
        if self.do_LL_CS:
            # The co-occurrence matrix is not needed for the similarities.
            self.coll_df = ()
            if not self.do_LL:
                self.LL_df = self._load_matrix('LL')
            self._score('LL', cat_sim)
        if self.do_PPMI:
            if self.do_LL_CS:
                self.coll_df = self._load_matrix('COOC')
            self.PPMI()
        self.coll_df = ()
        if self.do_PPMI_CS:
            if not self.do_PPMI:
                self.PPMI_df = self._load_matrix('PPMI')
            self._score('PPMI', cat_sim)
        if self.remove:
            self.remove_dest()

    def _write_scores(self):
        """Save self.param_dict as a pickle and as a tab-separated table."""
        dest_file = '{0}/Win_size_tests/{1}_{2}_{3}_weighted={4}_lems={5}_algos={6}.pickle'.format(
            self.dir, os.path.basename(self.dir), self.min_w, self.max_w, self.w_tests,
            self.l_tests, self.stat_algos)
        os.makedirs(os.path.dirname(dest_file), exist_ok=True)
        with open(dest_file, mode='wb') as f:
            dump(self.param_dict, f)
        # Rows are ordered by window size, the number in 'window=<n>'.
        ordered = sorted(self.param_dict,
                         key=lambda x: int(x.split('_')[1].split('=')[1]))
        with open(os.path.splitext(dest_file)[0] + '.csv', mode='w', encoding='utf-8') as f:
            f.write('Test Details\tMean Category Score\tCategory Z-Score')
            for k in ordered:
                f.write('\n{}\t{}\t{}'.format(k, self.param_dict[k][0],
                                              self.param_dict[k][1]))

    def remove_dest(self):
        """Remove the destination folder self.dest and everything in it.

        self.RunTests calls this only if self.remove is True.
        """
        shutil.rmtree(self.dest)

    def RunTests(self):
        """Run the requested steps for every parameter combination.

        The scores are printed and, if a similarity step was requested,
        written to a pickle and a .csv file in <self.dir>/Win_size_tests.
        """
        scoring = self.do_LL_CS or self.do_PPMI_CS
        cat_sim = None
        if scoring:
            from Chapter_2.LouwNidaCatSim import CatSimWin as cat_sim
        for w in range(self.min_w, self.max_w + 1, self.step):
            for weighted in self.w_tests:
                for lems in self.l_tests:
                    self.w, self.weighted, self.lems = w, weighted, lems
                    print('weighted %s, lemmata %s, w=%s at %s' %
                          (self.weighted,
                           self.lems,
                           self.w,
                           datetime.datetime.now().time().isoformat()))
                    self._run_combination(cat_sim)
        print(self.param_dict)
        if scoring:
            self._write_scores()
def cmd(argv=None):
    """Run SemPipeline or ParamTester from the command line.

    The general options have to be given before the sub-command
    ('SemPipeline' or 'ParamTester').  Without a sub-command the help text
    is printed.

    :param argv: the arguments to parse; None means sys.argv[1:]
    :type argv: list
    """
    parser = argparse.ArgumentParser(description='Pipeline for automatic extraction of semantic data.')
    parser.add_argument('--win_size', type=int, default=10, help='The size of the contexts window')
    parser.add_argument('--no_lems', dest='lemmata', action='store_false', help='Use a non-lemmatized corpus')
    parser.add_argument('--lems', dest='lemmata', action='store_true', help='Use a lemmatized corpus')
    parser.add_argument('--no_weight', dest='weighted', action='store_false', help='Use a non-weighted window type')
    parser.add_argument('--weight', dest='weighted', action='store_true', help='Use a weighted window type')
    parser.add_argument('--algo', type=str, default='LL', choices=['LL', 'PPMI', 'both'], help='The significance algorithm to use')
    parser.add_argument('--files', type=str, help='The directory path in which the .txt files for your corpus are located.')
    parser.add_argument('--c', type=int, default=1, help='The number of cores to use during co-occurrence calculations')
    parser.add_argument('--occ_dict', type=str, help='The filepath to the file that contains the dictionary of word occurrences')
    parser.add_argument('--min_count', type=int, default=1, help='The minimum number of occurrences for words to be considered in the calculations')
    parser.add_argument('--jobs', type=int, default=1, help='The value for n_jobs in sklearn.metrics.pairwise_distances for cosine similarity calculations')
    parser.add_argument('--no_stops', dest='stops', action='store_false', help='Ignore stop words')
    parser.add_argument('--stops', dest='stops', action='store_true', help='Use stop words')
    parser.set_defaults(lemmata=False, weighted=False, stops=True)
    # Add subparsers for the whole process or for different steps
    subparsers = parser.add_subparsers(dest='subparser_name')
    subparsers.add_parser('SemPipeline')
    parser_params = subparsers.add_parser('ParamTester')
    # The window range has no sensible default, so it must be given.
    parser_params.add_argument('--min_w', type=int, required=True, help='The minimum context window size to be tested')
    parser_params.add_argument('--max_w', type=int, required=True, help='The maximum context window size to be tested')
    parser_params.add_argument('--step', type=int, required=True, help='The size of the steps to test between min_w and max_w')
    parser_params.add_argument('--w_tests', type=str, default='both', choices=['True', 'False', 'both'], help='Whether to test only the weighted window (True), the unweighted (False), or both (both)')
    parser_params.add_argument('--l_tests', type=str, default='both', choices=['True', 'False', 'both'], help='Whether to test only the lemmatized text (True), the unlemmatized text (False), or both (both)')
    # One or more step names; a list keeps 'LL' and 'LL_CS' apart.
    parser_params.add_argument('--steps', type=str, nargs='+', default=['all'], choices=['all', 'coocs', 'LL', 'PPMI', 'LL_CS', 'PPMI_CS', 'remove'], help='The ParamTester functions to run')
    args = parser.parse_args(argv)
    print('lems={} weighted={}'.format(args.lemmata, args.weighted), flush=True)
    if args.subparser_name is None:
        parser.print_help()
        return
    # Everything except the sub-command name is handed to the class; options
    # a class does not know end up in its **kwargs.
    params = dict(vars(args))
    chosen = params.pop('subparser_name')
    if chosen == 'SemPipeline':
        SemPipeline(**params).runPipeline()
    elif chosen == 'ParamTester':
        ParamTester(**params).RunTests()


if __name__ == '__main__':
    cmd()