--- ECHO_content/ECHO_language.py 2004/11/27 22:38:44 1.2 +++ ECHO_content/ECHO_language.py 2006/09/10 22:57:38 1.5 @@ -1,153 +1,21 @@ """Methoden fuer Language Technologies""" -def donatus(txt2): - import xmlrpclib - server = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc") +from Products.ECHO_content.analyseAndTag.analyseAndTag import DonatusFile - txt=txt2.encode('utf-8') - bin=xmlrpclib.Binary(txt) - - - - ret=server.donatus.analyze(bin) - - - return ret['morphData'].data - - -def donatusVariant2Lemma(morphData): - """creates hash variant -> morphdata""" - ret={} - dom=xml.dom.minidom.parseString(morphData) - lemmas=dom.getElementsByTagName('lemma') - for lemma in lemmas: - variants=lemma.getElementsByTagName('variant') - for variant in variants: - atr=variant.getAttribute('form') - if ret.has_key(atr): - ret[atr].append=lemma.getAttribute('form') - else: - ret[atr]=[lemma.getAttribute('form')] - - return ret +import xml.parsers class ECHO_language: - """language methods""" - - def donatusVariant2Lemma(self,nr='1'): - """analyze by donatus""" - return donatusVariant2Lemma(donatus(self.lemmatize(nr))) - - def tagLex(self,nr="1"): - """generate Links""" - global retLex - global toggle - - toggle=0 - retLex="" - - lemmatized=self.lemmatize(nr)[0:] - #print "ho",repr(lemmatized) - variants=donatusVariant2Lemma(donatus(lemmatized)) - - def createTag(name,attrs): - global toggle - - if name=="w": - toggle=1 - return "" - else: - tag="<" - tag+=name - for attr in attrs.keys(): - tag+=""" %s="%s" """%(attr,attrs[attr]) - tag+=">" - return tag - - def createData(data): - global toggle - astring="""%s """ - if toggle: # tag war ein w - toggle=0 - if variants.has_key(data): - return astring%(variants[data][0],data) - else: - return astring%(data,data) - - - - # 3 handler functions - def start_element(name, attrs): - global retLex - - retLex+=createTag(name,attrs) - def end_element(name): - global retLex - if not name=="w": - retLex+=""%(name.encode('utf-8')) - - - def char_data(data): - global retLex - if data: - try: - retLex+=createData(data) - except: - """no""" - - p = xml.parsers.expat.ParserCreate() - - p.StartElementHandler = start_element - p.EndElementHandler = end_element - p.CharacterDataHandler = char_data - - p.Parse(lemmatized.encode('utf-8'),1) - #print repr(lemmatized.encode('utf-8')) - - return retLex - - - def lemmatize(self,nr='1',lang="de"): - """lemmatize""" - global ret - ret="" - - def createTag(name,attrs): - tag="<" - tag+=name - for attr in attrs.keys(): - tag+=""" %s="%s" """%(attr,attrs[attr]) - tag+=">" - return tag - - def insertW(str): - splitted=str.split() - wordlist=["%s"%split for split in splitted] - return string.join(wordlist,'\n') - - # 3 handler functions - def start_element(name, attrs): - global ret - ret+=createTag(name,attrs) - def end_element(name): - global ret - ret+=""%(name.encode('utf-8')) - - def char_data(data): - global ret - ret+=insertW(data) - - p = xml.parsers.expat.ParserCreate() - - p.StartElementHandler = start_element - p.EndElementHandler = end_element - p.CharacterDataHandler = char_data - - p.Parse(self.getPage(nr), 1) - txt=""" -
%s
-
""" - ret=txt%(lang,ret) - - return ret + """language methods""" + + + def tagLex(self,nr="1"): + """gerateword tags""" + txt=self.getPage(_pn=nr) + + df=DonatusFile(txt=self.getPage(_pn=nr)) + + return df.convertedXML() + #return DonatusFile(txt=self.getPage(_pn=nr)).convertedXML() + + \ No newline at end of file