# Source code for Chapter_4.treebank_extract

__author__ = 'matt'

from lxml import etree
from collections import defaultdict
import re


class extractDependencies:
    """Extract syntactic relationships from PROIEL-formatted treebank data.

    Typical use is to build the object, call :meth:`find_occs` to locate the
    occurrences of the target word, and then call one of the ``get_*``
    methods.  Each ``get_*`` method resets ``self.dependents`` and fills it
    with rows of the form ``[occurrence citation, occurrence form, related
    word, related part-of-speech, related citation]``.

    :param target: The target word that is being analyzed.
    :type target: str
    :param orig: The full path to the XML file containing the treebank
        information
    :type orig: str
    :param relation: The code for the syntactic relationship that the target
        word should share with the other word (e.g., "sub").
        ``get_objects`` matches it as a substring of the token's relation,
        ``get_genitive_deps`` requires an exact match, and ``get_subjects``
        and ``get_Christos`` do not use it.
    :type relation: str
    :param form: Whether to use the dictionary form of the target word
        ("lemma") or its inflected form ("form").  Any other value prints a
        warning and falls back to "lemma".
    :type form: str
    """

    # ``get_objects`` and ``find_occs`` address tokens by their absolute
    # path; the other methods search the whole document.
    _SENTENCE_TOKEN = '/proiel/source/div/sentence/token'
    _ANY_TOKEN = '//token'
    # Ten-character morphology tag whose seventh character is ``g``
    # (presumably the genitive value of the PROIEL case slot -- confirm).
    _GENITIVE_MORPHOLOGY = re.compile(r'.{6}g.{3}')
    # Heads with these lemmas are skipped over in ``get_Christos``.
    _SKIPPED_HEAD_LEMMAS = ("Ἰησοῦς", 'κύριος')
    _CHRISTOS_CSV = '/media/matt/Data/DissProject/Data/Chapter_4/christou_head_words_verses.csv'

    def __init__(self, target, orig, relation='sub', form='lemma'):
        self.target = target
        self.treebank = etree.parse(orig).getroot()
        self.relation = relation
        if form not in ['form', 'lemma']:
            print('Only "form" and "lemma" are valid values for form.')
            self.form = 'lemma'
        else:
            self.form = form
        # Start empty so the ``get_*`` methods are harmless no-ops (instead
        # of raising AttributeError) when called before ``find_occs``.
        self.occs = []
        self.dependents = []

    def _tokens_where(self, base, attr, value):
        """Return every token under *base* whose *attr* equals *value*."""
        return self.treebank.xpath('{}[@{}="{}"]'.format(base, attr, value))

    def _token_by_id(self, base, token_id):
        """Return the token with id *token_id*, or ``None``.

        ``None`` is returned both when *token_id* itself is ``None`` (the
        referring attribute was absent) and when no token has that id.
        """
        if token_id is None:
            return None
        matches = self._tokens_where(base, 'id', token_id)
        return matches[0] if matches else None

    def _resolve_antecedent(self, token, base):
        """Follow ``antecedent-id`` links away from an empty token.

        A token is replaced by its antecedent for as long as it has both an
        ``empty-token-sort`` and an ``antecedent-id``.  If a link points at a
        missing token, or the links form a cycle, the last token reached is
        returned rather than raising or looping forever.
        """
        visited = set()
        while token.get('empty-token-sort') and token.get('antecedent-id'):
            antecedent_id = token.get('antecedent-id')
            if antecedent_id in visited:
                break
            visited.add(antecedent_id)
            antecedent = self._token_by_id(base, antecedent_id)
            if antecedent is None:
                break
            token = antecedent
        return token

    def find_occs(self):
        """Store in ``self.occs`` every token whose form/lemma is the target.

        The attribute compared is chosen by ``self.form``.
        """
        # The target is bound as an XPath variable so that a word containing
        # a double quote cannot break the expression.
        query = '{}[@{}=$target]'.format(self._SENTENCE_TOKEN, self.form)
        self.occs = self.treebank.xpath(query, target=self.target)

    def get_objects(self):
        """Collect dependents of the target whose relation contains ``self.relation``.

        For a dependent tagged ``R-`` (presumably a preposition -- confirm)
        the recorded word is its lemma followed by the forms of its own
        dependents; otherwise it is the dependent's form.  A dependent that
        raises ``TypeError`` while being processed (e.g. a missing relation,
        or a form-less token under an ``R-`` dependent) is printed and
        skipped.
        """
        base = self._SENTENCE_TOKEN
        self.dependents = []
        for occ in self.occs:
            for dep in self._tokens_where(base, 'head-id', occ.get('id')):
                dep = self._resolve_antecedent(dep, base)
                try:
                    if self.relation not in dep.get('relation'):
                        continue
                    if dep.get('part-of-speech') == 'R-':
                        governed = [
                            t.get('form')
                            for t in self._tokens_where(base, 'head-id', dep.get('id'))
                        ]
                        word = '{} {}'.format(dep.get('lemma'), ' '.join(governed))
                    else:
                        word = dep.get('form')
                    self.dependents.append([
                        occ.get('citation-part'),
                        occ.get('form'),
                        word,
                        dep.get('part-of-speech'),
                        dep.get('citation-part'),
                    ])
                except TypeError:
                    print(etree.tostring(dep))
                    continue

    def get_subjects(self):
        """Collect the lemmas of dependents whose relation is exactly ``sub``."""
        base = self._ANY_TOKEN
        self.dependents = []
        for occ in self.occs:
            for dep in self._tokens_where(base, 'head-id', occ.get('id')):
                dep = self._resolve_antecedent(dep, base)
                if dep.get('relation') == 'sub':
                    self.dependents.append([
                        occ.get('citation-part'),
                        occ.get('form'),
                        dep.get('lemma'),
                        dep.get('part-of-speech'),
                        dep.get('citation-part'),
                    ])

    def get_Christos(self, out_path=_CHRISTOS_CSV):
        """Collect the head of each occurrence and tabulate the heads in a file.

        Starting from the occurrence's head, heads whose lemma is one of
        ``_SKIPPED_HEAD_LEMMAS`` are skipped by climbing to their own head.
        The head finally reached is recorded when its part-of-speech contains
        ``N``.  Occurrences with no head, or whose chain of heads runs out,
        are skipped.

        A tab-separated table of head lemma, count and the list of citations
        is then written to *out_path*, most frequent head first.

        :param out_path: File to write the table to.  Defaults to the path
            this method has always written to.
        :type out_path: str
        """
        base = self._ANY_TOKEN
        self.dependents = []
        for occ in self.occs:
            head = self._token_by_id(base, occ.get('head-id'))
            if head is None:
                # Nothing governs this occurrence (or the head is missing).
                continue
            head = self._resolve_antecedent(head, base)
            while head is not None and head.get('lemma') in self._SKIPPED_HEAD_LEMMAS:
                head = self._token_by_id(base, head.get('head-id'))
            if head is None:
                # Ran past the top of the tree while skipping heads.
                continue
            try:
                if 'N' in head.get('part-of-speech'):
                    self.dependents.append([
                        occ.get('citation-part'),
                        occ.get('form'),
                        head.get('lemma'),
                        head.get('part-of-speech'),
                        head.get('citation-part'),
                    ])
            except TypeError:
                print(etree.tostring(head))
                continue
        head_verses = defaultdict(list)
        for row in self.dependents:
            head_verses[row[2]].append(row[-1])
        # The lemmas are Greek, so the encoding is fixed rather than left to
        # the platform default.
        with open(out_path, mode='w', encoding='utf-8') as f:
            f.write('Head word\tCount\tVerses\n')
            ranked = sorted(head_verses, key=lambda x: len(head_verses[x]), reverse=True)
            for h in ranked:
                f.write('{}\t{}\t{}\n'.format(h, len(head_verses[h]), head_verses[h]))

    def get_genitive_deps(self):
        """Collect genitive dependents that stand in ``self.relation`` to the target.

        A dependent is kept when its relation equals ``self.relation``, its
        morphology tag matches ``_GENITIVE_MORPHOLOGY`` and its
        part-of-speech is not ``S-`` (presumably the article -- confirm).
        A dependent with the right relation but no morphology raises
        ``TypeError``; it is printed and skipped.
        """
        base = self._ANY_TOKEN
        self.dependents = []
        for occ in self.occs:
            for dep in self._tokens_where(base, 'head-id', occ.get('id')):
                dep = self._resolve_antecedent(dep, base)
                try:
                    if (self.relation == dep.get('relation')
                            and self._GENITIVE_MORPHOLOGY.match(dep.get('morphology'))
                            and dep.get('part-of-speech') != 'S-'):
                        self.dependents.append([
                            occ.get('citation-part'),
                            occ.get('form'),
                            dep.get('form'),
                            dep.get('part-of-speech'),
                            dep.get('citation-part'),
                        ])
                except TypeError as E:
                    print(E, etree.tostring(dep))
                    continue