Source code for Chapter_1.consolidate_test_results

__author__ = 'matt'

from glob import glob
import os
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
from itertools import cycle


class win_tests:
    """
    Collects and graphs the results of multiple runs of
    Data_Production.sem_extract_pipeline.ParamTester

    :param orig: the folder in which the .csv files containing the results
        are located
    :type orig: str
    :param corpus: the corpus that is being analyzed. This should be the
        same string used in the file names to designate the corpus (e.g., NT)
    :type corpus: str
    :param file_pattern: the file extension of the files containing the
        results
    :type file_pattern: str
    """

    def __init__(self, orig, corpus, file_pattern='*.csv'):
        # Result files are those in ``orig`` whose names start with ``corpus``
        # and match ``file_pattern``.
        self.files = glob('{}/{}{}'.format(orig, corpus, file_pattern))
        self.corpus = corpus
        self.orig = orig

    def build_df(self):
        """
        Consolidates every result file into ``self.df`` and writes it to disk.

        Files are processed in ascending order of the integer found in the
        second underscore-separated field of the file name. In each file the
        first line is skipped; every other non-blank line is tab-separated,
        its first field being an underscore-separated parameter string whose
        second element has the form ``<name>=<int>`` and its last field being
        a float score. The integer becomes the row label, the remaining
        parameter elements (first, third and fourth) become the column label.

        The resulting frame is stored in ``self.df`` and saved, tab-separated,
        as ``consolidated_<corpus>.csv`` in ``self.orig``.
        """
        results = defaultdict(dict)
        ordered_files = sorted(
            self.files,
            key=lambda path: int(os.path.basename(path).split('_')[1]),
        )
        for path in ordered_files:
            with open(path) as f:
                rows = f.read().split('\n')[1:]
            for row in sorted(rows):
                # A trailing newline (or any empty line) yields a blank row
                # that has no parameter string to parse; skip it.
                if not row.strip():
                    continue
                fields = row.split('\t')
                params = fields[0].split('_')
                window = int(params[1].split('=')[1])
                column = '{}_{}_{}'.format(params[0], params[2], params[3])
                results[column][window] = float(fields[-1])
        self.df = pd.DataFrame(results)
        self.df.to_csv(
            '{}/consolidated_{}.csv'.format(self.orig, self.corpus),
            sep='\t',
        )

    def graph_it(self):
        """
        Plots the consolidated results and saves the graphs as PNG files.

        Requires ``build_df`` to have been run first, since it reads
        ``self.df``. Columns whose name contains ``lems=False`` are always
        drawn to ``nolem_graph.png``; columns whose name contains
        ``lems=True`` are drawn to ``lem_graph.png`` only when at least one
        such column exists. Both files are written to ``self.orig``.
        """
        # The style cycles are shared by both graphs, so the second graph
        # continues from wherever the first one stopped.
        rotate = cycle([45, -45])
        offset = cycle([(0, 30), (0, -10)])
        marker = cycle(['k*-', 'k.-', 'kx-', 'ko-'])

        nolems_columns = [c for c in self.df.columns if "lems=False" in c]
        lems_columns = [c for c in self.df.columns if "lems=True" in c]

        self._draw_graph(nolems_columns, 'nolem_graph.png',
                         marker, offset, rotate)
        if lems_columns:
            self._draw_graph(lems_columns, 'lem_graph.png',
                             marker, offset, rotate)

    def _draw_graph(self, columns, file_name, marker, offset, rotate):
        """Plots ``columns`` of ``self.df``, labels each maximum and saves the figure."""
        for column in columns:
            name_parts = column.split('_')
            plt.plot(self.df[column], next(marker),
                     label=' '.join([name_parts[0], name_parts[2]]))
        for column in columns:
            series = self.df[column]
            peak = (series.idxmax(), float(series.max()))
            # The annotation text is passed positionally because its keyword
            # name differs between matplotlib versions (``s`` vs ``text``).
            plt.annotate(str(round(peak[1], 4)), xy=peak,
                         xytext=next(offset), textcoords='offset points',
                         rotation=next(rotate))
        plt.legend(loc=0, fontsize='small')
        plt.xticks(self.df.index)
        plt.xlim(self.df.index[0], self.df.index[-1])
        plt.xlabel('Window Size')
        plt.ylabel('Category Z-Score')
        plt.grid(True)
        plt.savefig('{}/{}'.format(self.orig, file_name), dpi=500)
        plt.clf()