Source code for Data_Production.sem_extract_pipeline

#! /usr/bin/env python3

__author__ = 'matt'

import sys
import os

PACKAGE_PARENT = '..'
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT)))

import re
from collections import Counter
import datetime
from math import log, ceil

import pandas as pd
import numpy as np

try:
    from Data_Production.TK_files import tk_control
except ImportError:
    print('Tkinter cannot be used on this Python installation.\nPlease designate a list of files in the files variable.')
from sklearn.metrics.pairwise import pairwise_distances
from glob import glob
# from celery import group
# from proj.tasks import counter, svd_calc
from itertools import combinations_with_replacement
from pickle import dump
from multiprocessing import Pool
from Data_Production.multi_tasks import counter
import argparse
import shutil


[docs]class SemPipeline:
    """ This class produces matrices representing cooccurrence counts, statistical significance, and similarity data for a corpus

    :param win_size: context window size
    :type win_size: int
    :param lemmata: whether to use word lemmata
    :type lemmata: bool
    :param weighted: whether to use a weighted window type
    :type weighted: bool
    :param algo: the significance algorithm to use. 'LL' and 'PPMI' are implemented
    :type algo: str
    :param sim_algo: the similarity algorithm to use. 'CS' is implemented
    :type sim_algo: str
    :param files: the directory in which the individual .txt files are held
    :type files: str
    :param c: the number of cores to use in self.cooc_counter (will be removed in the future)
    :type c: int
    :param occ_dict: the path and filename for the occurrence dictionary pickle
    :type occ_dict: str
    :param min_count: the minimum occurrence count below which words will not be counted
    :type min_count: int
    :param jobs: number of jobs to use during the cosine similarity calculations
    :type jobs: int
    :param stops: whether to include stop words or not (True means to include them)
    :type stops: bool

    :ivar w: the context window size
    :type w: int
    :ivar lems: whether a lemmatized or unlemmatized text will be used
    :type lems: bool
    :ivar weighted: whether a weighted or unweighted context window will be used (True == weighted)
    :type weighted: bool
    :ivar algo: which significance algorithm will be used (PPMI or LL)
    :type algo: str
    :ivar sim_algo: the similarity algorithm to be used
    :type sim_algo: str
    :ivar dir: the directory path in which the texts are located
    :type dir: str
    :ivar c: the number of cores to use during co-occurrence counting
    :type c: int
    :ivar occ_dict: the location for the dictionary representing word counts for every word
    :type occ_dict: str
    :ivar min_count: the minimum threshold of occurrences for the words to be calculated
    :type min_count: int
    :ivar jobs: the value to be used for n_jobs in the cosine similarity calculations
    :type jobs: int
    :ivar stops: a list of stop-words to ignore during the calculations
    :type stops: (str)
    :ivar ind: the indices for the rows and columns of the matrix (i.e., the words) - filled in self.cooc_counter
    :type ind: [str]
    :ivar cols: the length of self.ind - filled in self.cooc_counter
    :ivar cols: int
    :ivar coll_df: transformed into numpy.memmap and filled in self.cooc_counter
    :type coll_df: tuple
    :ivar LL_df: transformed into numpy.memmap and filled in self.LL
    :type LL_df: tuple
    :ivar PPMI_df: transformed into numpy.memmap and filled in self.PPMI
    :type PPMI_df: tuple
    :ivar CS_df: transformed into numpy.memmap and filled in self.CS
    :type CS_df: tuple
    :ivar stat_df: filled with either self.PPMI_df or self.LL_df in self.CS
    :type stat_df: tuple
    :ivar dest: the destination directory for all files - filled in self.makeFileNames
    :type dest: str
    :ivar corpus: the name of the corpus under investigation - filled in self.makeFileNames
    :type corpus: str
    """

    def __init__(self, win_size=10, lemmata=True, weighted=True, algo='PPMI',
                 sim_algo='cosine', files=None, c=8, occ_dict=None,
                 min_count=1, jobs=1, stops=True, **kwargs):
        """ This class produces matrices representing cooccurrence counts, statistical significance, and similarity data for a corpus

        :param win_size: context window size
        :type win_size: int
        :param lemmata: whether to use word lemmata
        :type lemmata: bool
        :param weighted: whether to use a weighted window type
        :type weighted: bool
        :param algo: the significance algorithm to use. 'LL' and 'PPMI' are implemented
        :type algo: str
        :param sim_algo: the similarity algorithm to use. 'CS' is implemented
        :type sim_algo: str
        :param files: the directory in which the individual .txt files are held
        :type files: str
        :param c: the number of cores to use in self.cooc_counter (will be removed in the future)
        :type c: int
        :param occ_dict: the path and filename for the occurrence dictionary pickle
        :type occ_dict: str
        :param min_count: the minimum occurrence count below which words will not be counted
        :type min_count: int
        :param jobs: number of jobs to use during the cosine similarity calculations
        :type jobs: int
        :param stops: whether to include stop words or not (True means to include them)
        :type stops: bool

        :ivar w: the context window size
        :type w: int
        :ivar lems: whether a lemmatized or unlemmatized text will be used
        :type lems: bool
        :ivar weighted: whether a weighted or unweighted context window will be used (True == weighted)
        :type weighted: bool
        :ivar algo: which significance algorithm will be used (PPMI or LL)
        :type algo: str
        :ivar sim_algo: the similarity algorithm to be used
        :type sim_algo: str
        :ivar dir: the directory path in which the texts are located
        :type dir: str
        :ivar c: the number of cores to use during co-occurrence counting
        :type c: int
        :ivar occ_dict: the location for the dictionary representing word counts for every word
        :type occ_dict: str
        :ivar min_count: the minimum threshold of occurrences for the words to be calculated
        :type min_count: int
        :ivar jobs: the value to be used for n_jobs in the cosine similarity calculations
        :type jobs: int
        :ivar stops: a list of stop-words to ignore during the calculations
        :type stops: (str)
        :ivar ind: the indices for the rows and columns of the matrix (i.e., the words) - filled in self.cooc_counter
        :type ind: [str]
        :ivar cols: the length of self.ind - filled in self.cooc_counter
        :ivar cols: int
        :ivar coll_df: transformed into numpy.memmap and filled in self.cooc_counter
        :type coll_df: np.memmap
        :ivar LL_df: transformed into numpy.memmap and filled in self.LL
        :type LL_df: np.memmap
        :ivar PPMI_df: transformed into numpy.memmap and filled in self.PPMI
        :type PPMI_df: np.memmap
        :ivar CS_df: transformed into numpy.memmap and filled in self.CS
        :type CS_df: np.memmap
        :ivar stat_df: filled with either self.PPMI_df or self.LL_df in self.CS
        :type stat_df: np.memmap
        :ivar dest: the destination directory for all files - filled in self.makeFileNames
        :type dest: str
        :ivar corpus: the name of the corpus under investigation - filled in self.makeFileNames
        :type corpus: str
        """
        self.w = win_size
        self.lems = lemmata
        self.weighted = weighted
        self.algo = algo
        if sim_algo in ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
                        'manhattan']:
            self.sim_algo = sim_algo
        else:
            print(
                "The only accepted values for 'sim_algo' are 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', or 'manhattan'")
            print("Setting 'sim_algo' to 'cosine'")
            self.sim_algo = 'cosine'
        if self.algo not in ['PPMI', 'LL', 'both']:
            print(
                'The only accepted values for "algo" are "PPMI", "LL", or "both".')
            print("Setting 'algo' to 'both'")
            self.algo = 'both'
        self.dir = files
        self.c = c
        if occ_dict == 'None':
            self.occ_dict = None
        else:
            self.occ_dict = occ_dict
        if min_count == 'None':
            self.min_count = None
        else:
            self.min_count = min_count
        self.jobs = jobs
        if not stops:
            self.stops = ('μή', 'ἑαυτοῦ', 'ἄν', 'ἀλλ’', 'ἀλλά', 'ἄλλος', 'ἀπό',
                          'ἄρα', 'αὐτός', 'δ’', 'δέ', 'δή', 'διά', 'δαί',
                          'δαίς', 'ἔτι', 'ἐγώ', 'ἐκ', 'ἐμός', 'ἐν', 'ἐπί',
                          'εἰ', 'εἰμί', 'εἴμι', 'εἰς', 'γάρ', 'γε', 'γα^', 'ἡ',
                          'ἤ', 'καί', 'κατά', 'μέν', 'μετά', 'μή', 'ὁ', 'ὅδε',
                          'ὅς', 'ὅστις', 'ὅτι', 'οὕτως', 'οὗτος', 'οὔτε',
                          'οὖν',
                          'οὐδείς', 'οἱ', 'οὐ', 'οὐδέ', 'οὐκ', 'περί', 'πρός',
                          'σύ', 'σύν', 'τά', 'τε', 'τήν', 'τῆς', 'τῇ', 'τι',
                          'τί', 'τις', 'τίς', 'τό', 'τοί', 'τοιοῦτος', 'τόν',
                          'τούς', 'τοῦ', 'τῶν', 'τῷ', 'ὑμός', 'ὑπέρ', 'ὑπό',
                          'ὡς', 'ὦ', 'ὥστε', 'ἐάν', 'παρά', 'σός')
        else:
            self.stops = ()

        # the following ivars are filled later in the class
        self.ind = []
        self.cols = 0
        self.coll_df = ()
        self.LL_df = ()
        self.PPMI_df = ()
        self.CS_df = ()
        self.stat_df = ()
        self.dest = ''
        self.corpus = ''

    def file_chooser(self):
        """ Uses tkinter.filedialog, as implemented in the tk_control class to fill self.dir if files=None

        """
        self.dir = tk_control("askdirectory(title='In which directory are the XML file(s) would you like to analyze?')")

    def produce_file_names(self, step):
        """

        :param step: The step of the process, used to generate the correct file name
        :type step: str
        :return: The file name
        :rtype: str
        """
        return '{dest}/{step}_{win}_lems={lems}_{corpus}_min_occ={min_count}_no_stops={stops}_weighted={weighted}.dat'.format(
            dest=self.dest, step=step, win=str(self.w), lems=self.lems, corpus=self.corpus, min_count=self.min_count,
            stops=bool(self.stops), weighted=self.weighted)

    def word_extract(self, text, pattern, stops=()):
        """ Extracts a list of words from self.t

        :return: list of words
        :rtype: list
        """
        words = []
        for line in text:
            word = re.sub(pattern, r'\1', line)
            if word != '' and word not in stops:
                words.append(word)
        return words

    def word_counter(self, words, counts, min_lems=set()):
        """

        :param words:
        :type words: list
        :param min_lems:
        :type min_lems: set
        :return: the co-occurrence counts for the files
        :rtype: Counter
        """
        step = ceil(len(words) / self.c)
        steps = []
        for i in range(self.c):
            steps.append((step * i, min(step * (i + 1), len(words))))
        '''self.res = group(
            counter.s(self.weighted, self.w, words, limits) for limits in
            steps)().get()
        '''
        # res = []
        with Pool(processes=self.c) as pool:
            # for limits in steps:
            results = pool.starmap(counter, [(self.weighted, self.w, words, limits) for limits in steps])
            # res.append(results.get())
        # since the counter task returns Counter objects, the update method
        # below adds instead of replacing the values
        for r in results:
            for key in r.keys():
                if key not in min_lems:
                    if key in counts.keys():
                        counts[key].update(r[key])
                    else:
                        counts[key] = r[key]
        return counts

    def cooc_counter(self):
        """ Counts the number of times each word co-occurs with each other word

        :ivar ind: the words that represent the ordered indices of all matrices produced in later calculation
        :type ind: [int]
        :ivar coll_df: self.coll_df
        :type coll_df: numpy.memmap
        """
        cooc_dest = self.produce_file_names('COOC')
        index_dest = self.produce_file_names('Index').replace('.dat', '.pickle')
        # Check to see if a cooccurrence file already exists, if so, exit the method
        if os.path.isfile(cooc_dest):
            self.ind = pd.read_pickle(index_dest)
            self.cols = len(self.ind)
            self.coll_df = np.memmap(cooc_dest, dtype='float', mode='r',
                                     shape=(len(self.ind), self.cols))
            return

        # Initialize local variables
        counts = Counter()
        if self.lems:
            pattern = re.compile(r'.+?lem="([^"]*).*')
        else:
            pattern = re.compile(r'.+?>([^<]*).*')
        if self.occ_dict:
            occs = pd.read_pickle(self.occ_dict)
            min_lems = set([w for w in occs if occs[w] < self.min_count])
            del occs
        else:
            min_lems = set()

        # Compute co-occurrence counts for each file in self.dir
        for file in glob('{0}/*.txt'.format(self.dir)):
            with open(file) as f:
                words = self.word_extract(f.read().lower().split('\n'), pattern, self.stops)
            counts = self.word_counter(words, counts, min_lems)


        # Fill the ivars that come from the co-occurrence counts
        self.ind = list(counts.keys())
        self.cols = len(self.ind)

        # Write the counts dictionary to a numpy.memmap file
        print('Now writing cooccurrence file at {0}'.format(datetime.datetime.now().time().isoformat()))
        self.coll_df = np.memmap(cooc_dest, dtype='float', mode='w+', shape=(self.cols, self.cols))
        for i, w in enumerate(self.ind):
            s = pd.Series(counts[w], index=self.ind,
                          dtype=np.float64).fillna(0)
            self.coll_df[i] = s.values
            if i % 5000 == 0:
                os.system('echo COOC {0}% done'.format((i / self.cols * 100)))
                del self.coll_df
                self.coll_df = np.memmap(cooc_dest, dtype='float', mode='r+',
                                         shape=(
                                             self.cols, self.cols))

        # Re-open self.coll_df as read-only
        del self.coll_df
        self.coll_df = np.memmap(cooc_dest, dtype='float', mode='r', shape=(self.cols, self.cols))

        # Save the index list and the column list
        with open(index_dest, mode='wb') as f:
            dump(self.ind, f)

    def log_L(self, k, n, x):
        """ Calculates the values for the individual elements of the Log-likelihood equation using the
        binomial distribution function L(k,n,x) = (x**k)*(1-x)**(n-k).

        :param k:
        :type k: pandas.Series
        :param n:
        :type n: numpy.ndarray
        :param x:
        :type x: numpy.ndarray
        :return: Log-likelihood values
        :rtype: numpy.ndarray
        """
        return np.log(np.power(np.float64(x), k)
                      * np.power(np.float64(1 - x), n - k))

    def log_space_L(self, k, n, x):
        """ Calculates the values for the individual elements of the Log-likelihood equation using the
        binomial distribution function L(k,n,x) = (x**k)*(1-x)**(n-k).
        Moves the calculations to log-space to deal with floats that are too small for float64.

        :param k:
        :type k: numpy.ndarray
        :param n:
        :type n: numpy.ndarray
        :param x:
        :type x: numpy.ndarray
        :return: Log-likelihood values
        :rtype: numpy.ndarray
        """
        return np.log(x) * (k) + (np.log(1 - x) * (n - k))

    def log_like(self, row, C2, P, N):
        """ Guides the process of Log-likelihood calculations for a single row

        :param row: the index of the row in the table to be calculated
        :type row: int
        :param C2: number of co-occurrences for each row of the table
        :type C2: numpy.ndarray
        :param P: ratio of co-occurrences per row to total co-occurrences in the table
        :type P: numpy.ndarray
        :param N: total number of co-occurrences in the table
        :type N: float
        :return: Log-likelihood values for a single row in the table
        :rtype: numpy.ndarray
        """
        C12 = self.coll_df[row]
        C1 = np.sum(C12)
        # P1 is ratio of single co-occurrence values to the total co-occurrences for that row
        P1 = C12 / C1
        # P2 ratio of total co-occurrences for a word minus the co-occurrences
        # with the word in question to the total number of co-occurrences in
        # the table minus the total co-occurrences for the row.
        P2 = (C2 - C12) / (N - C1)

        LL1 = self.log_space_L(C12, C1, P)
        LL2 = self.log_space_L(C2 - C12, N - C1, P)
        LL3 = self.log_L(C12, C1, P1)

        # The following finds all inf and -inf values in LL3 by moving calculations into log space.

        LL3_inf = np.where(abs(LL3) == np.inf)
        if len(LL3_inf) > 0:
            for ind in LL3_inf[0]:
                try:
                    LL3[ind] = (log(P1[ind]) * C12[ind]) + (
                        log(1 - P1[ind]) * (C1 - C12[ind]))
                except ValueError:
                    LL3[ind] = 0

        LL4 = self.log_space_L(C2 - C12, N - C1, P2)

        # The following finds all inf and -inf values in LL4 by moving calculations into log space.

        LL4_inf = np.where(abs(LL4) == np.inf)
        if len(LL4_inf) > 0:
            for ind in LL4_inf[0]:
                try:
                    LL4[ind] = self.log_L((C2[ind] - C12[ind]), (N - C1),
                                          P2[ind])
                except ValueError:
                    LL4[ind] = 0

        a = -2 * (LL1 + LL2 - LL3 - LL4)
        # a[np.where(np.isfinite(a) == False)] = 0
        return a

    def LL(self):
        """ Guides the Log-likelihood calculations for the whole matrix

        :ivar LL_df: matrix of log-likelihood values
        :type LL_df: numpy.memmap
        """
        dest_file = self.produce_file_names('LL')

        # If a log-likelihood file exists already for these parameters, exit the method
        if os.path.isfile(dest_file):
            self.LL_df = np.memmap(dest_file, dtype='float', mode='r',
                                   shape=(self.cols, self.cols))
            return

        # Initialize local variables
        n = np.sum(self.coll_df)
        c2 = np.sum(self.coll_df, axis=0)
        p = c2 / n

        # Fill self.LL_df with log-likelihood values
        self.LL_df = np.memmap(dest_file, dtype='float', mode='w+',
                               shape=(self.cols, self.cols))
        for i in range(self.cols):
            self.LL_df[i] = self.log_like(i, c2, p, n)
            if i % 5000 == 0:
                os.system('echo LL {0}% done'.format((i / self.cols * 100)))
                del self.LL_df
                self.LL_df = np.memmap(dest_file, dtype='float', mode='r+',
                                       shape=(self.cols, self.cols))

        # Change all numpy.nan and numpy.inf values to 0
        # This is necessary for later calculations that will raise errors for non-finite values
        #self.LL_df[np.where(np.isfinite(self.LL_df) == False)] = 0

        # Dump memory and reload self.LL_df as read-only
        del self.LL_df
        self.LL_df = np.memmap(dest_file, dtype='float', mode='r', shape=(self.cols, self.cols))

    def PMI_calc(self, row, P2, N):
        """ Calculates PPMI values for one table row

        :param row: index for the word's row in the table
        :type row: int
        :param P2: ratio of co-occurrences per row to total co-occurrences in the table
        :type P2: Numpy ndarray
        :param N: total co-occurrences in the table
        :type N: float
        :return: PPMI values for a row in the table
        :rtype: Numpy ndarray
        """
        C12 = self.coll_df[row]
        # C1 is the total co-occurrences in the row
        C1 = np.sum(C12)
        # P1 is the probability that the word co-occurs
        P1 = C1 / N
        # P12 is a vector of the probabilities that the word occurs with any other word
        P12 = C12 / N
        a = np.log2(np.divide(P12, P1 * P2))
        a[np.where(np.isfinite(a) == False)] = 0
        a[a < 0] = 0
        return a

    def PPMI(self):
        """ Guides the PPMI calculation process for the whole table

        :ivar PPMI_df: matrix of PPMI values
        :type PPMI_df: numpy.memmap
        """
        dest_file = self.produce_file_names('PPMI')

        # If a PPMI file already exists, exit the method
        if os.path.isfile(dest_file):
            self.PPMI_df = np.memmap(dest_file, dtype='float', mode='r',
                                     shape=(self.cols, self.cols))
            return

        # Initialize local variables
        n = np.sum(self.coll_df)
        #values for C2
        p2 = np.sum(self.coll_df, axis=0) / n

        # Fill self.PPMI_df with values
        self.PPMI_df = np.memmap(dest_file, dtype='float', mode='w+', shape=(self.cols, self.cols))
        for i, w in enumerate(self.ind):
            self.PPMI_df[i] = self.PMI_calc(i, p2, n)
            if i % 5000 == 0:
                os.system('echo PPMI {0}% done'.format((i / self.cols * 100)))
                del self.PPMI_df
                self.PPMI_df = np.memmap(dest_file, dtype='float', mode='r+',
                                         shape=(self.cols, self.cols))

        # Change all numpy.nan and numpy.inf values to 0
        # This is necessary for later calculations that will raise errors for non-finite values
        #self.PPMI_df[np.where(np.isfinite(self.PPMI_df) == False)] = 0

        # Dump memory and reload self.PPMI_df as read-only
        del self.PPMI_df
        self.PPMI_df = np.memmap(dest_file, dtype='float', mode='r',
                                 shape=(self.cols, self.cols))

    def CS(self, algorithm):
        """ Calculates the cosine similarity of every matrix row with every other row

        :param algorithm: which algorithm (PPMI or LL) is being tested
        :type algorithm: str
        :param e: SVD exponent
        :type e: float
        :ivar CS_df: matrix of cosine similarity values
        :type CS_df: numpy.memmap
        """
        print('Starting {} calculations for {} for '
              'w={}, lem={}, weighted={} at {}'.format(self.sim_algo,
                                                       self.corpus,
                                                       str(self.w),
                                                       self.lems,
                                                       self.weighted,
                                                       datetime.datetime.now().time().isoformat()))
        dest_file = self.produce_file_names('CS_{}'.format(algorithm))
        if os.path.isfile(dest_file):
            return
        if algorithm == 'PPMI':
            self.stat_df = self.PPMI_df
            self.stat_file = self.produce_file_names('PPMI')
        elif algorithm == 'LL':
            self.stat_df = self.LL_df
            self.stat_file = self.produce_file_names('LL')

        self.CS_df = np.memmap(dest_file, dtype='float', mode='w+',
                               shape=(self.cols, self.cols))
        if self.sim_algo == 'cosine':
            self.CS_df[:] = 1 - pairwise_distances(self.stat_df,
                                                   metric=self.sim_algo,
                                                   n_jobs=self.jobs)
        else:
            self.CS_df[:] = pairwise_distances(self.stat_df,
                                               metric=self.sim_algo,
                                               n_jobs=self.jobs)
        # self.cs_loop(dest_file)
        del self.CS_df
        self.CS_df = np.memmap(dest_file, dtype='float', mode='r', shape=(self.cols, self.cols))
        print('Finished with {} calculations for {} for '
              'w={}, lem={}, weighted={} at {}'.format(self.sim_algo,
                                                       self.corpus,
                                                       str(self.w),
                                                       self.lems,
                                                       self.weighted,
                                                       datetime.datetime.now().time().isoformat()))

    def cs_loop(self, dest_file):
        """ Divides self.stat_df into chunks more easily handled in memory
        (the number of rows use at a time is determined in the step variable)
        and then loops through all chunk combinations

        :param dest_file: the file name to which to save the CS data
        :type dest_file: str
        """
        step = 5000
        ind = self.cols
        steps = []
        x = step
        while x < ind:
            steps.append((x - step, x))
            x += step
        steps.append((steps[-1][-1], ind))
        last_ind = steps[0]

        for i1, i2 in combinations_with_replacement(steps, 2):
            part1 = self.stat_df[i1[0]:i1[1]]
            part2 = self.stat_df[i2[0]:i2[1]]
            self.CS_df[i1[0]:i1[1], i2[0]:i2[1]] = 1- pairwise_distances(part1, part2, metric='cosine')
            self.CS_df[i2[0]:i2[1], i1[0]:i1[1]] = self.CS_df[i1[0]:i1[1], i2[0]:i2[1]].T
            if last_ind != i1:
                os.system('echo CS {0}% done'.format((i1[0] / ind) * 100))
                del self.CS_df
                self.CS_df = np.memmap(dest_file, dtype='float', mode='r+', shape=(ind, ind))
            last_ind = i1
        '''for df_ind in steps:
            part1 = self.stat_df[df_ind:min(df_ind + step, ind)]
            for df_ind2 in steps2:
                part2 = self.stat_df[df_ind2:min(df_ind2 + step, ind)]
                self.CS_df[df_ind:min(df_ind + step, ind), df_ind2:min(df_ind2 + step, ind)] = 1- pairwise_distances(part1, part2, metric='cosine')
            print('{0}% done'.format((df_ind / self.cols * 100)))
            del self.CS_df
            self.CS_df = np.memmap(dest_file, dtype='float', mode='r+', shape=(ind, ind))
        '''

    def stat_eval(self):
        """ Guides the statistical significance calculations required by the parameters given in self.__init__

        """
        print('Starting %s calculations for %s for '
              'w=%s, lem=%s, weighted=%s at %s' %
              (self.algo,
               self.corpus,
               str(self.w),
               self.lems,
               self.weighted,
               datetime.datetime.now().time().isoformat()))
        if self.algo == 'both':
            print('Starting PPMI at {0}'.format(os.system('date')))
            self.PPMI()
            print('Starting LL at {0}'.format(os.system('date')))
            self.LL()
        elif self.algo == 'PPMI':
            self.PPMI()
        elif self.algo == 'LL':
            self.LL()
        del self.coll_df
        print('Finished with %s calculations for %s for '
              'w=%s, lem=%s, weighted=%s at %s' %
              (self.algo,
               self.corpus,
               str(self.w),
               self.lems,
               self.weighted,
               datetime.datetime.now().time().isoformat()))

    def make_dest(self):
        """ Constructs the name of the destination directory and creates the directory if needed

        :ivar dest: the directory path into which the results will be saved
        :type dest: str
        :ivar corpus: the name of the corpus being analyzed
        :type corpus: str
        """
        self.dest = os.path.join(self.dir, str(self.w))
        try:
            os.mkdir(self.dest)
        except:
            pass
        self.corpus = self.dir.split('/')[-1]

    def runPipeline(self):
        """ Guides the whole Pipeline process using the params given in self.__init__

        """
        if not self.dir:
            self.file_chooser()
        self.make_dest()
        print('Started analyzing %s at %s' %
              (self.corpus,
               datetime.datetime.now().time().isoformat()))
        self.cooc_counter()
        self.stat_eval()
        if self.algo == 'both':
            self.CS('PPMI')
            self.CS('LL')
        elif self.algo == 'PPMI':
            self.CS('PPMI')
        elif self.algo == 'LL':
            self.CS('LL')

        print('Finished at %s' % (datetime.datetime.now().time().isoformat()))


[docs]class ParamTester(SemPipeline):
    """ Runs parameter testing for the corpus in question
    the testing parameters are specified in the self.RunTests function

    :param c: the number of cores to use in the co-occurrence calculations
    :type c: int
    :param jobs: the number of cores to use in the cosine similarity calculations
    :type jobs: int
    :param min_count: the minimum occurrence count. Words below this count will not be counted.
        The purpose here is for memory management. My tests have shown that using all words produces better results.
    :type min_count: int
    :param files: the directory path for the .txt files that make up the corpus
    :type files: str
    :param stops: the stops words to be ignored in the calculations
    :type stops: (str)
    :param min_w: the minimum context window size to use
    :type min_w: int
    :param max_w: the maximum context window size to use
    :type max_w: int
    :param step: the size of the steps between min_w and max_w
    :type step: int
    :param lem_file: the path and filename for the word occurrence dictionary pickle
    :type lem_file: str
    :param w_tests: whether to use weighted ("True") or unweighted ("False") window types or "both"
    :type w_tests: str
    :param l_tests: whether to use word lemmas ("True") or inflected forms ("False") or "both"
    :type l_tests: str
    :param steps: the steps in the calculation process to perform. Allowed: 'all', 'coocs', 'LL', 'PPMI', 'LL_CS' (cosine similarity based on an existing Log-likelihood matrix), or 'PPMI_CS'.
    :type steps: list

    :ivar c: the number of cores to use in the co-occurrence calculations
    :type c: int
    :ivar stops: list of stop words to ignore during the calculations
    :type stops: (str)
    :ivar min_count: the minimum number of occurrences for a word to be used in the calculations
    :type min_count: int
    :ivar files: the directory path for the .txt files that make up the corpus
    :type files: str
    :ivar sim_algo: the similarity algorithm to use in the calculations
    :type sim_algo: str
    :ivar ind: the indices for the rows and columns of the matrix (i.e., the words) - filled in self.cooc_counter
    :type ind: [str]
    :ivar cols: the length of self.ind - filled in self.cooc_counter
    :ivar cols: int
    :ivar coll_df: transformed into numpy.memmap and filled in self.cooc_counter
    :type coll_df: tuple
    :ivar LL_df: transformed into numpy.memmap and filled in self.LL
    :type LL_df: tuple
    :ivar PPMI_df: transformed into numpy.memmap and filled in self.PPMI
    :type PPMI_df: tuple
    :ivar CS_df: transformed into numpy.memmap and filled in self.CS
    :type CS_df: tuple
    :ivar stat_df: filled with either self.PPMI_df or self.LL_df in self.CS
    :type stat_df: tuple
    :ivar param_dict: filled with the scores for each set of parameters in self.RunTests
    :type param_dict: dict
    """

    def __init__(self, min_w, max_w, step, c=8, jobs=1, min_count=1, files=None, stops=tuple(), lem_file=None,
                 w_tests='both', l_tests='both', steps=['all'], **kwargs):
        self.c = c
        self.stops = stops
        self.jobs = jobs
        self.min_count = min_count
        self.dir = files
        self.sim_algo = 'cosine'
        self.min_w = min_w
        self.max_w = max_w
        self.step = step
        # added for compatibility with SemPipeline
        self.occ_dict = None
        if isinstance(w_tests, str):
            if w_tests == 'both':
                self.w_tests = (True, False)
            elif w_tests == 'True':
                self.w_tests = [True]
            elif w_tests == 'False':
                self.w_tests = [False]
        else:
            self.w_tests = w_tests
        if isinstance(l_tests, str):
            if l_tests == 'both':
                self.l_tests = (True, False)
            elif l_tests == 'True':
                self.l_tests = [True]
            elif l_tests == 'False':
                self.l_tests = [False]
        else:
            self.l_tests = l_tests
        if lem_file == 'None':
            self.lem_file = None
        else:
            self.lem_file = lem_file
        self.do_coocs = False
        self.do_LL = False
        self.do_PPMI = False
        self.do_LL_CS = False
        self.do_PPMI_CS = False
        self.do_all = False
        self.remove = False
        if 'all' in steps:
            self.do_coocs = True
            self.do_LL = True
            self.do_PPMI = True
            self.do_LL_CS = True
            self.do_PPMI_CS = True
            self.do_all = True
            self.stat_algos = 'Both'
            self.remove = True
        else:
            if 'coocs' in steps:
                self.do_coocs = True
            if 'LL' in steps:
                self.do_LL = True
            if 'PPMI' in steps:
                self.do_PPMI = True
            if 'LL_CS' in steps and 'PPMI_CS' in steps:
                self.do_LL_CS = True
                self.do_PPMI_CS = True
                self.stat_algos = 'Both'
            elif 'LL_CS' in steps:
                    self.do_LL_CS = True
                    self.stat_algos = 'LL'
            elif 'PPMI_CS' in steps:
                self.do_PPMI_CS = True
                self.stat_algos = 'PPMI'
            if 'remove' in steps:
                self.remove = True

        # the following ivars are filled later in the class
        self.ind = []
        self.cols = 0
        self.coll_df = ()
        self.LL_df = ()
        self.PPMI_df = ()
        self.CS_df = ()
        self.stat_df = ()
        self.param_dict = {}

    def remove_dest(self):
        """ Removes the destination folder of the file if self.remove is True
        """
        shutil.rmtree(self.dest)

    def RunTests(self):
        """ Guides the parameter testing process
        """
        from Chapter_2.LouwNidaCatSim import CatSimWin


        for self.w in range(self.min_w, self.max_w + 1, self.step):
            for self.weighted in self.w_tests:
                for self.lems in self.l_tests:
                    print('weighted %s, lemmata %s, w=%s at %s' %
                          (self.weighted,
                           self.lems,
                           self.w,
                           datetime.datetime.now().time().isoformat()))
                    self.make_dest()
                    if self.do_coocs:
                        self.cooc_counter()
                    if not self.ind:
                        self.ind = pd.read_pickle(self.produce_file_names('Index').replace('.dat', '.pickle'))
                        self.cols = len(self.ind)
                    self.coll_df = np.memmap(self.produce_file_names('COOC'), dtype='float', mode='r', shape=(self.cols, self.cols))
                    if self.do_LL:
                        self.LL_df = self.LL()
                    del self.coll_df
                    if self.do_LL_CS:
                        if not self.LL_df:
                            self.LL_df = np.memmap(self.produce_file_names('LL'), dtype='float', mode='r', shape=(self.cols, self.cols))
                        pipe = CatSimWin('LL', [self.w],
                                         lems=self.lems,
                                         CS_dir=self.dir,
                                         dest_dir='{}/Win_size_tests/LN'.format(self.dir),
                                         sim_algo='cosine',
                                         corpus=(self.dir.split('/')[-1], 1, 1.0, self.weighted),
                                         lem_file=self.lem_file)
                        self.CS('LL')
                        pipe.df = self.CS_df
                        del self.CS_df
                        del self.LL_df
                        pipe.ind = self.ind
                        pipe.SimCalc(self.w)
                        pipe.AveCalc(self.w)
                        pipe.WriteFiles()
                        self.param_dict['LL_window={}_lems={}_weighted={}'.format(self.w, self.lems, self.weighted)] = pipe.ave_no_93[self.w]
                        del pipe
                    self.coll_df = np.memmap(self.produce_file_names('COOC'), dtype='float', mode='r', shape=(self.cols, self.cols))
                    if self.do_PPMI:
                        self.PPMI_df = self.PPMI()
                    del self.coll_df
                    if self.do_PPMI_CS:
                        if not self.PPMI_df:
                            self.PPMI_df = np.memmap(self.produce_file_names('PPMI'), dtype='float', mode='r', shape=(self.cols, self.cols))
                        pipe = CatSimWin('PPMI', [self.w],
                                         lems=self.lems,
                                         CS_dir=self.dir,
                                         dest_dir='{}/Win_size_tests/LN'.format(self.dir),
                                         sim_algo='cosine',
                                         corpus=(self.dir.split('/')[-1], 1, 1.0, self.weighted),
                                         lem_file=self.lem_file)
                        self.CS('PPMI')
                        pipe.df = self.CS_df
                        del self.PPMI_df
                        del self.CS_df
                        pipe.ind = self.ind
                        pipe.SimCalc(self.w)
                        pipe.AveCalc(self.w)
                        pipe.WriteFiles()
                        self.param_dict['PPMI_window={}_lems={}_weighted={}'.format(self.w, self.lems, self.weighted)] = pipe.ave_no_93[self.w]
                        del pipe
                    if self.remove:
                        self.remove_dest()
            print(self.param_dict)
        if self.do_LL_CS or self.do_PPMI_CS:
            dest_file = '{0}/Win_size_tests/{1}_{2}_{3}_weighted={4}_lems={5}_algos={6}.pickle'.format(
                self.dir, os.path.basename(self.dir), self.min_w, self.max_w, self.w_tests,
                self.l_tests, self.stat_algos)
            with open(dest_file, mode='wb') as f:
                dump(self.param_dict, f)
            with open(dest_file.replace('.pickle', '.csv'), mode='w') as f:
                f.write('Test Details\tMean Category Score\tCategory Z-Score')
                for k in sorted(self.param_dict.keys(),
                                key=lambda x: int(x.split('_')[1].split('=')[1])):
                    f.write('\n{}\t{}\t{}'.format(k, self.param_dict[k][0],
                                                  self.param_dict[k][1]))


def cmd():
    parser = argparse.ArgumentParser(description='Pipeline for automatic extraction of semantic data.')
    parser.add_argument('--win_size', type=int, default=10, help='The size of the contexts window')
    parser.add_argument('--no_lems', dest='lemmata', action='store_false', help='Use a non-lemmatized corpus')
    parser.add_argument('--lems', dest='lemmata', action='store_true', help='Use a lemmatized corpus')
    parser.add_argument('--no_weight', dest='weighted', action='store_false', help='Use a non-weighted window type')
    parser.add_argument('--weight', dest='weighted', action='store_true', help='Use a weighted window type')
    parser.add_argument('--algo', type=str, default='LL', choices=['LL', 'PPMI'], help='The significance algorithm to use')
    parser.add_argument('--files', type=str, help='The directory path in which the .txt files for your corpus are located.')
    parser.add_argument('--c', type=int, default=1, help='The number of cores to use during co-occurrence calculations')
    parser.add_argument('--occ_dict', type=str, help='The filepath to the file that contains the dictionary of word occurrences')
    parser.add_argument('--min_count', type=int, default=1, help='The minimum number of occurrences for words to be considered in the calculations')
    parser.add_argument('--jobs', type=int, default=1, help='The value for n_jobs in sklearn.metrics.pairwise_distances for cosine similarity calculations')
    parser.add_argument('--no_stops', dest='stops', action='store_false', help='Ignore stop words')
    parser.add_argument('--stops', dest='stops', action='store_true', help='Use stop words')
    parser.set_defaults(lemmata=False, weighted=False, stops=True)

    # Add subparsers for the whole process or for different steps
    subparsers = parser.add_subparsers(dest='subparser_name')
    parser_pipeline = subparsers.add_parser('SemPipeline')
    parser_pipeline.set_defaults(func=SemPipeline)
    parser_params = subparsers.add_parser('ParamTester')
    parser_params.add_argument('--min_w', type=int, help='The minimum context window size to be tested')
    parser_params.add_argument('--max_w', type=int, help='The maximum context window size to be tested')
    parser_params.add_argument('--step', type=int, help='The size of the steps to test between min_w and max_w')
    parser_params.add_argument('--w_tests', type=str, choices=['True', 'False', 'both'], help='Whether to test only the weighted window (True), the unweighted (False), or both (both)')
    parser_params.add_argument('--l_tests', type=str, choices=['True', 'False', 'both'], help='Whether to test only the lemmatized text (True), the unlemmatized text (False), or both (both)')
    parser_params.add_argument('--steps', type=str, default='all', choices=['all', 'coocs', 'LL', 'PPMI', 'LL_CS', 'PPMI_CS', 'remove'], help='The ParamTester functions to run')
    parser_params.set_defaults(func=ParamTester)
    args = parser.parse_args()
    os.system('echo lems={} weighted={}'.format(args.lemmata, args.weighted))
    if args.subparser_name == 'SemPipeline':
        args.func(**vars(args)).runPipeline()
    elif args.subparser_name == 'ParamTester':
        args.func(**vars(args)).RunTests()

if __name__ == '__main__':
    cmd()