version 1.3, 2005/10/26 08:35:53
|
version 1.6, 2006/09/11 14:43:23
|
Line 1
|
Line 1
|
"""Methoden fuer Language Technologies""" |
"""Methoden fuer Language Technologies""" |
|
|
def donatus(txt2): |
|
import xmlrpclib |
|
|
|
server = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc") |
from Products.ECHO_content.analyseAndTag.analyseAndTag import DonatusFile |
|
|
txt=txt2.encode('utf-8') |
import xml.parsers |
bin=xmlrpclib.Binary(txt) |
|
|
|
|
|
|
|
ret=server.donatus.analyze(bin) |
|
|
|
|
|
return ret['morphData'].data |
|
|
|
|
|
def donatusVariant2Lemma(morphData): |
|
"""creates hash variant -> morphdata""" |
|
ret={} |
|
dom=xml.dom.minidom.parseString(morphData) |
|
lemmas=dom.getElementsByTagName('lemma') |
|
for lemma in lemmas: |
|
variants=lemma.getElementsByTagName('variant') |
|
for variant in variants: |
|
atr=variant.getAttribute('form') |
|
if ret.has_key(atr): |
|
ret[atr].append=lemma.getAttribute('form') |
|
else: |
|
ret[atr]=[lemma.getAttribute('form')] |
|
|
|
return ret |
|
|
|
class ECHO_language: |
class ECHO_language: |
"""language methods""" |
"""language methods""" |
|
|
def donatusVariant2Lemma(self,nr='1'): |
|
"""analyze by donatus""" |
|
return donatusVariant2Lemma(donatus(self.lemmatize(nr))) |
|
|
|
def tagLex(self,nr="1"): |
def tagLex(self,nr="1"): |
"""generate Links""" |
"""gerateword tags""" |
global retLex |
txt=self.getPage(_pn=nr) |
global toggle |
|
|
df=DonatusFile(txt=self.getPage(_pn=nr),baseUri=self.baseUri) |
toggle=0 |
|
retLex="" |
return df.convertedXML() |
|
#return DonatusFile(txt=self.getPage(_pn=nr)).convertedXML() |
lemmatized=self.lemmatize(nr)[0:] |
|
#print "ho",repr(lemmatized) |
|
variants=donatusVariant2Lemma(donatus(lemmatized)) |
|
|
|
def createTag(name,attrs): |
|
global toggle |
|
|
|
if name=="w": |
|
toggle=1 |
|
return "" |
|
else: |
|
tag="<" |
|
tag+=name |
|
for attr in attrs.keys(): |
|
tag+=""" %s="%s" """%(attr,attrs[attr]) |
|
tag+=">" |
|
return tag |
|
|
|
def createData(data): |
|
global toggle |
|
astring="""<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """ |
|
if toggle: # tag war ein w |
|
toggle=0 |
|
if variants.has_key(data): |
|
return astring%(variants[data][0],data) |
|
else: |
|
return astring%(data,data) |
|
|
|
|
|
|
|
# 3 handler functions |
|
def start_element(name, attrs): |
|
global retLex |
|
|
|
retLex+=createTag(name,attrs) |
|
def end_element(name): |
|
global retLex |
|
if not name=="w": |
|
retLex+="</%s>"%(name.encode('utf-8')) |
|
|
|
|
|
def char_data(data): |
|
global retLex |
|
if data: |
|
try: |
|
retLex+=createData(data) |
|
except: |
|
"""no""" |
|
|
|
p = xml.parsers.expat.ParserCreate() |
|
|
|
p.StartElementHandler = start_element |
|
p.EndElementHandler = end_element |
|
p.CharacterDataHandler = char_data |
|
|
|
p.Parse(lemmatized.encode('utf-8'),1) |
|
#print repr(lemmatized.encode('utf-8')) |
|
|
|
return retLex |
|
|
|
|
|
def lemmatize(self,nr='1',lang="de"): |
|
"""lemmatize""" |
|
global ret |
|
ret="" |
|
|
|
def createTag(name,attrs): |
|
tag="<" |
|
tag+=name |
|
for attr in attrs.keys(): |
|
tag+=""" %s="%s" """%(attr,attrs[attr]) |
|
tag+=">" |
|
return tag |
|
|
|
def insertW(str): |
|
splitted=str.split() |
|
wordlist=["<w>%s</w>"%split for split in splitted] |
|
return string.join(wordlist,'\n') |
|
|
|
# 3 handler functions |
|
def start_element(name, attrs): |
|
global ret |
|
ret+=createTag(name,attrs) |
|
def end_element(name): |
|
global ret |
|
ret+="</%s>"%(name.encode('utf-8')) |
|
|
|
def char_data(data): |
|
global ret |
|
ret+=insertW(data) |
|
|
|
p = xml.parsers.expat.ParserCreate() |
|
|
|
p.StartElementHandler = start_element |
|
p.EndElementHandler = end_element |
|
p.CharacterDataHandler = char_data |
|
|
|
p.Parse(self.getPage(nr), 1) |
|
txt="""<wtag locator="xxx"> |
|
<section lang="%s"><s>%s</s></section> |
|
</wtag>""" |
|
ret=txt%(lang,ret) |
|
|
|
return ret |
|