"""Methods for language technologies (Donatus analysis and word tagging)."""

import xml.dom.minidom
import xml.parsers
import xml.parsers.expat
from xml.sax.saxutils import escape

from Products.ECHO_content.analyseAndTag.analyseAndTag import DonatusFile


def donatus(txt2):
    """Send text to the Donatus XML-RPC service and return its morphology data.

    txt2 is encoded as UTF-8, wrapped in an ``xmlrpclib.Binary`` and passed
    to ``donatus.analyze`` on the remote server. The ``data`` payload of the
    ``morphData`` entry of the response is returned.
    """
    # Imported lazily, as in the original, so that loading this module does
    # not depend on the XML-RPC client being importable.
    import xmlrpclib

    server = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
    payload = xmlrpclib.Binary(txt2.encode('utf-8'))
    response = server.donatus.analyze(payload)
    return response['morphData'].data


def donatusVariant2Lemma(morphData):
    """Build a dict mapping each variant form to the list of its lemma forms.

    morphData is an XML string containing ``lemma`` elements with nested
    ``variant`` elements; both carry a ``form`` attribute. A variant that
    occurs under several lemmas collects all of them, in document order.
    """
    variant_to_lemmas = {}
    dom = xml.dom.minidom.parseString(morphData)
    for lemma in dom.getElementsByTagName('lemma'):
        lemma_form = lemma.getAttribute('form')
        for variant in lemma.getElementsByTagName('variant'):
            # The original assigned to ``list.append`` instead of calling it,
            # which raised AttributeError for the second lemma of a variant.
            variant_to_lemmas.setdefault(variant.getAttribute('form'), []).append(lemma_form)
    return variant_to_lemmas


def _open_tag(name, attrs):
    """Serialise an opening tag for *name* with the attributes in *attrs*."""
    parts = ["<", name]
    for attr in attrs.keys():
        # expat delivers attribute values unescaped; escape them again so the
        # generated markup stays well-formed when it is parsed a second time.
        value = escape(attrs[attr], {'"': "&quot;"})
        parts.append(""" %s="%s" """ % (attr, value))
    parts.append(">")
    return "".join(parts)


class ECHO_language:
    """language methods"""

    def donatusVariant2Lemma(self, nr='1'):
        """Analyze page *nr* with Donatus and return the variant -> lemmas dict."""
        return donatusVariant2Lemma(donatus(self.lemmatize(nr)))

    def tagLex(self, nr="1"):
        """Generate links for the words of page *nr* via DonatusFile."""
        df = DonatusFile(txt=self.getPage(_pn=nr))
        return df.wordsToLinks()

    def tagLex_old(self, nr="1"):
        """Generate links for page *nr* (older, expat-based implementation).

        The page is lemmatized, analyzed by Donatus, and then re-serialised:
        every element other than ``w`` is copied through, and the first chunk
        of character data inside each ``w`` element is formatted with the
        link template using its first lemma (or the word itself when Donatus
        returned no lemma). Character data outside ``w`` elements is dropped.
        """
        lemmatized = self.lemmatize(nr)
        variants = donatusVariant2Lemma(donatus(lemmatized))

        # NOTE(review): this template has one placeholder but is formatted
        # with two values (lemma, word). It appears to have lost its link
        # markup; the original markup cannot be derived from this file, so
        # the literal is left untouched. Restore it from version control.
        link_template = """%s """

        pieces = []
        # Mutable cell instead of a module-level global: set when a ``w``
        # element opens, cleared by the first character data that follows.
        state = {'in_w': False}

        def start_element(name, attrs):
            if name == "w":
                state['in_w'] = True
            else:
                pieces.append(_open_tag(name, attrs))

        def end_element(name):
            if name != "w":
                pieces.append("</%s>" % name)

        def char_data(data):
            # Text outside a ``w`` element is intentionally not copied; the
            # original achieved this through a swallowed TypeError.
            if not data or not state['in_w']:
                return
            state['in_w'] = False
            lemma = variants.get(data, [data])[0]
            try:
                pieces.append(link_template % (lemma, data))
            except TypeError:
                # Best-effort behaviour kept from the original: while the
                # template above is incomplete the word is silently skipped.
                # Once the template is restored this handler can be removed.
                pass

        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = start_element
        parser.EndElementHandler = end_element
        parser.CharacterDataHandler = char_data
        parser.Parse(lemmatized.encode('utf-8'), 1)
        return "".join(pieces)

    def lemmatize(self, nr='1', lang="de"):
        """Re-serialise page *nr* with every word wrapped in a ``w`` element.

        The page returned by ``self.getPage(nr)`` is parsed with expat.
        Elements are copied through, and character data is split on
        whitespace with each word emitted as ``<w>word</w>`` on its own line.
        The result is then formatted into a wrapper template together with
        *lang*.

        Raises TypeError at the final formatting step for as long as the
        wrapper template is incomplete (see the note in the body).
        """
        pieces = []

        def start_element(name, attrs):
            pieces.append(_open_tag(name, attrs))

        def end_element(name):
            pieces.append("</%s>" % name)

        def char_data(data):
            # Escape the text again: expat hands it over unescaped, and the
            # output of this method is parsed as XML by tagLex_old.
            words = ["<w>%s</w>" % escape(word) for word in data.split()]
            pieces.append("\n".join(words))

        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = start_element
        parser.EndElementHandler = end_element
        parser.CharacterDataHandler = char_data
        parser.Parse(self.getPage(nr), 1)

        # NOTE(review): this template has a single placeholder but is
        # formatted with two values (lang, body), so the expression below
        # raises TypeError. The literal appears to have lost its wrapper
        # markup (presumably an element carrying the language); it cannot be
        # derived from this file, so it is left untouched. Restore it from
        # version control before relying on this method.
        txt = "\n%s\n"
        return txt % (lang, "".join(pieces))