# Source code for Chapter_4.treebank_extract
__author__ = 'matt'
from lxml import etree
from collections import defaultdict
import re
class extractDependencies:
    """Extracts specified syntactic relationships from PROIEL-formatted dependency treebank data.

    At this point, each specific relationship uses a different function, e.g., ``get_objects`` retrieves all of the
    tokens for which the target token is the head.

    Every ``get_*`` method stores its result in ``self.dependents`` as a list of rows of the form
    ``[target citation, target form, related word, related part of speech, related citation]``.
    If ``find_occs`` has not been called yet, the ``get_*`` methods call it themselves.

    Tokens that cannot be processed (a required attribute is missing, an antecedent or head reference does not
    resolve) are printed to standard output and skipped.

    :param target: The target word that is being analyzed.
    :type target: str
    :param orig: The full path to the XML file containing the treebank information
    :type orig: str
    :param relation: The code for the syntactic relationship that the target word should share with the other word (e.g., "sub")
    :type relation: str
    :param form: Whether to use the dictionary form of the target word ("lemma") or its inflected form ("form")
    :type form: str
    """

    # Location of the token elements, relative to the ``proiel`` root element.
    _SENTENCE_TOKENS = 'source/div/sentence/token'
    # Historical output location of ``get_Christos``; kept as the default so existing callers are unaffected.
    _CHRISTOS_CSV = '/media/matt/Data/DissProject/Data/Chapter_4/christou_head_words_verses.csv'
    # Heads with these lemmas are stepped over by ``get_Christos`` in favour of their own head.
    _SKIPPED_HEAD_LEMMAS = ("Ἰησοῦς", 'κύριος')
    # Matches a morphology tag whose seventh character is "g".
    _GENITIVE = re.compile(r'.{6}g.{3}')

    def __init__(self, target, orig, relation='sub', form='lemma'):
        self.target = target
        self.treebank = etree.parse(orig).getroot()
        self.relation = relation
        if form not in ['form', 'lemma']:
            print('Only "form" and "lemma" are valid values for form.')
            self.form = 'lemma'
        else:
            self.form = form
        # Filled by find_occs(); None means "not searched yet".
        self.occs = None
        self.dependents = []

    def find_occs(self):
        """Collect every token whose ``self.form`` attribute equals the target into ``self.occs``.

        The comparison is done in Python rather than by interpolating the target into an XPath
        expression, so targets containing quotation marks are handled correctly.
        """
        wanted = str(self.target)
        self.occs = [tok for tok in self._sentence_tokens() if tok.get(self.form) == wanted]

    def get_objects(self):
        """Collect the dependents of the target whose relation code contains ``self.relation``.

        For a dependent tagged ``R-`` the related word is its lemma followed by the forms of its own
        dependents; for any other dependent it is the dependent's form.
        """
        by_id, by_head = self._index(self._sentence_tokens())
        self.dependents = []
        for occ in self._occurrences():
            for child in by_head.get(occ.get('id'), ()):
                dep = self._resolve_antecedent(child, by_id)
                if dep is None:
                    continue
                relation = dep.get('relation')
                if relation is None:
                    self._report('token has no relation', dep)
                    continue
                if self.relation not in relation:
                    continue
                if dep.get('part-of-speech') == 'R-':
                    forms = [x.get('form') for x in by_head.get(dep.get('id'), ())]
                    if None in forms:
                        # A dependent without a form cannot be joined into the phrase.
                        self._report('dependent without a form below', dep)
                        continue
                    word = '{} {}'.format(dep.get('lemma'), ' '.join(forms))
                else:
                    word = dep.get('form')
                self.dependents.append(self._row(occ, word, dep))

    def get_subjects(self):
        """Collect the lemmas of the dependents of the target whose relation is exactly ``sub``."""
        by_id, by_head = self._index(self._all_tokens())
        self.dependents = []
        for occ in self._occurrences():
            for child in by_head.get(occ.get('id'), ()):
                dep = self._resolve_antecedent(child, by_id)
                if dep is None:
                    continue
                if dep.get('relation') == 'sub':
                    self.dependents.append(self._row(occ, dep.get('lemma'), dep))

    def get_Christos(self, out_path=_CHRISTOS_CSV):
        """Collect the nominal heads of the target and write a per-head tally to a tab-separated file.

        The head of each occurrence is looked up, replaced by its antecedent when it is an empty token,
        and then replaced by its own head for as long as its lemma is one of ``_SKIPPED_HEAD_LEMMAS``.
        The result is kept when its part of speech contains ``N``.

        :param out_path: Where to write the tally (UTF-8). ``None`` skips writing the file.
        :type out_path: str
        """
        by_id, _ = self._index(self._all_tokens())
        self.dependents = []
        for occ in self._occurrences():
            head = by_id.get(occ.get('head-id'))
            if head is None:
                # The occurrence has no head, or its head-id points nowhere.
                self._report('occurrence has no head token', occ)
                continue
            head = self._resolve_antecedent(head, by_id)
            if head is None:
                continue
            head = self._climb_past(head, by_id, self._SKIPPED_HEAD_LEMMAS)
            if head is None:
                continue
            pos = head.get('part-of-speech')
            if pos is None:
                self._report('token has no part-of-speech', head)
                continue
            if 'N' in pos:
                self.dependents.append(self._row(occ, head.get('lemma'), head))
        if out_path is not None:
            self._write_head_counts(self.dependents, out_path)

    def get_genitive_deps(self):
        """Collect the dependents of the target that carry ``self.relation`` and a genitive morphology tag.

        Dependents whose part of speech is ``S-`` are excluded.
        """
        by_id, by_head = self._index(self._all_tokens())
        self.dependents = []
        for occ in self._occurrences():
            for child in by_head.get(occ.get('id'), ()):
                dep = self._resolve_antecedent(child, by_id)
                if dep is None:
                    continue
                if dep.get('relation') != self.relation:
                    continue
                morphology = dep.get('morphology')
                if morphology is None:
                    self._report('token has no morphology', dep)
                    continue
                if self._GENITIVE.match(morphology) and dep.get('part-of-speech') != 'S-':
                    self.dependents.append(self._row(occ, dep.get('form'), dep))

    def _occurrences(self):
        """Return ``self.occs``, running ``find_occs`` first if it has not been run yet."""
        if getattr(self, 'occs', None) is None:
            self.find_occs()
        return self.occs

    def _sentence_tokens(self):
        """Return the tokens found at ``/proiel/source/div/sentence/token``, in document order."""
        if self.treebank.tag != 'proiel':
            return []
        return self.treebank.findall(self._SENTENCE_TOKENS)

    def _all_tokens(self):
        """Return every ``token`` element in the document, in document order."""
        return list(self.treebank.iter('token'))

    @staticmethod
    def _index(tokens):
        """Build ``id -> token`` and ``head-id -> [tokens]`` lookups in one pass over *tokens*."""
        by_id = {}
        by_head = {}
        for tok in tokens:
            tok_id = tok.get('id')
            if tok_id is not None:
                # Keep the first token for a given id.
                by_id.setdefault(tok_id, tok)
            head_id = tok.get('head-id')
            if head_id is not None:
                by_head.setdefault(head_id, []).append(tok)
        return by_id, by_head

    @staticmethod
    def _report(reason, token):
        """Print a token that had to be skipped, together with the reason."""
        print(reason, etree.tostring(token))

    def _resolve_antecedent(self, token, by_id):
        """Follow ``antecedent-id`` links from an empty token; return ``None`` if the chain is broken or circular."""
        visited = set()
        while token.get('empty-token-sort') and token.get('antecedent-id'):
            ref = token.get('antecedent-id')
            if ref in visited or ref not in by_id:
                self._report('antecedent cannot be resolved', token)
                return None
            visited.add(ref)
            token = by_id[ref]
        return token

    def _climb_past(self, token, by_id, lemmas):
        """Move up the head chain while the token's lemma is in *lemmas*; return ``None`` if the chain ends."""
        visited = set()
        while token.get('lemma') in lemmas:
            head_id = token.get('head-id')
            if head_id in visited or head_id not in by_id:
                self._report('no usable head above', token)
                return None
            visited.add(head_id)
            token = by_id[head_id]
        return token

    @staticmethod
    def _row(occ, word, related):
        """Build one ``self.dependents`` row for occurrence *occ* and the related token."""
        return [occ.get('citation-part'), occ.get('form'), word,
                related.get('part-of-speech'), related.get('citation-part')]

    @staticmethod
    def _write_head_counts(rows, out_path):
        """Write one line per head word (most frequent first) with its count and list of citations."""
        head_verses = defaultdict(list)
        for row in rows:
            head_verses[row[2]].append(row[-1])
        # Explicit UTF-8: the head words are Greek and must not depend on the locale's default encoding.
        with open(out_path, mode='w', encoding='utf-8') as f:
            f.write('Head word\tCount\tVerses\n')
            for h in sorted(head_verses.keys(), key=lambda x: len(head_verses[x]), reverse=True):
                f.write('{}\t{}\t{}\n'.format(h, len(head_verses[h]), head_verses[h]))