--- ECHO_content/ECHO_language.py 2004/11/27 22:38:44 1.2 +++ ECHO_content/ECHO_language.py 2005/10/26 08:35:53 1.3 @@ -1,153 +1,153 @@ """Methoden fuer Language Technologies""" def donatus(txt2): - import xmlrpclib + import xmlrpclib - server = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc") + server = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc") - txt=txt2.encode('utf-8') - bin=xmlrpclib.Binary(txt) + txt=txt2.encode('utf-8') + bin=xmlrpclib.Binary(txt) - + - ret=server.donatus.analyze(bin) + ret=server.donatus.analyze(bin) - - return ret['morphData'].data + + return ret['morphData'].data def donatusVariant2Lemma(morphData): - """creates hash variant -> morphdata""" - ret={} - dom=xml.dom.minidom.parseString(morphData) - lemmas=dom.getElementsByTagName('lemma') - for lemma in lemmas: - variants=lemma.getElementsByTagName('variant') - for variant in variants: - atr=variant.getAttribute('form') - if ret.has_key(atr): - ret[atr].append=lemma.getAttribute('form') - else: - ret[atr]=[lemma.getAttribute('form')] + """creates hash variant -> morphdata""" + ret={} + dom=xml.dom.minidom.parseString(morphData) + lemmas=dom.getElementsByTagName('lemma') + for lemma in lemmas: + variants=lemma.getElementsByTagName('variant') + for variant in variants: + atr=variant.getAttribute('form') + if ret.has_key(atr): + ret[atr].append=lemma.getAttribute('form') + else: + ret[atr]=[lemma.getAttribute('form')] - return ret + return ret class ECHO_language: - """language methods""" - - def donatusVariant2Lemma(self,nr='1'): - """analyze by donatus""" - return donatusVariant2Lemma(donatus(self.lemmatize(nr))) - - def tagLex(self,nr="1"): - """generate Links""" - global retLex - global toggle - - toggle=0 - retLex="" - - lemmatized=self.lemmatize(nr)[0:] - #print "ho",repr(lemmatized) - variants=donatusVariant2Lemma(donatus(lemmatized)) - - def createTag(name,attrs): + """language methods""" + + def donatusVariant2Lemma(self,nr='1'): + """analyze by donatus""" + return donatusVariant2Lemma(donatus(self.lemmatize(nr))) + + def tagLex(self,nr="1"): + """generate Links""" + global retLex + global toggle + + toggle=0 + retLex="" + + lemmatized=self.lemmatize(nr)[0:] + #print "ho",repr(lemmatized) + variants=donatusVariant2Lemma(donatus(lemmatized)) + + def createTag(name,attrs): global toggle - - if name=="w": - toggle=1 - return "" - else: - tag="<" - tag+=name - for attr in attrs.keys(): - tag+=""" %s="%s" """%(attr,attrs[attr]) - tag+=">" - return tag - - def createData(data): - global toggle - astring="""%s """ - if toggle: # tag war ein w - toggle=0 - if variants.has_key(data): - return astring%(variants[data][0],data) - else: - return astring%(data,data) - - - - # 3 handler functions - def start_element(name, attrs): - global retLex - - retLex+=createTag(name,attrs) - def end_element(name): - global retLex - if not name=="w": - retLex+=""%(name.encode('utf-8')) - - - def char_data(data): - global retLex - if data: - try: - retLex+=createData(data) - except: - """no""" - - p = xml.parsers.expat.ParserCreate() - - p.StartElementHandler = start_element - p.EndElementHandler = end_element - p.CharacterDataHandler = char_data - - p.Parse(lemmatized.encode('utf-8'),1) - #print repr(lemmatized.encode('utf-8')) - - return retLex - - - def lemmatize(self,nr='1',lang="de"): - """lemmatize""" - global ret - ret="" - - def createTag(name,attrs): - tag="<" - tag+=name - for attr in attrs.keys(): - tag+=""" %s="%s" """%(attr,attrs[attr]) - tag+=">" - return tag - - def insertW(str): - splitted=str.split() - wordlist=["%s"%split for split in splitted] - return string.join(wordlist,'\n') - - # 3 handler functions - def start_element(name, attrs): - global ret - ret+=createTag(name,attrs) - def end_element(name): - global ret - ret+=""%(name.encode('utf-8')) - - def char_data(data): - global ret - ret+=insertW(data) - - p = xml.parsers.expat.ParserCreate() - - p.StartElementHandler = start_element - p.EndElementHandler = end_element - p.CharacterDataHandler = char_data - - p.Parse(self.getPage(nr), 1) - txt=""" -
%s
-
""" - ret=txt%(lang,ret) - - return ret + + if name=="w": + toggle=1 + return "" + else: + tag="<" + tag+=name + for attr in attrs.keys(): + tag+=""" %s="%s" """%(attr,attrs[attr]) + tag+=">" + return tag + + def createData(data): + global toggle + astring="""%s """ + if toggle: # tag war ein w + toggle=0 + if variants.has_key(data): + return astring%(variants[data][0],data) + else: + return astring%(data,data) + + + + # 3 handler functions + def start_element(name, attrs): + global retLex + + retLex+=createTag(name,attrs) + def end_element(name): + global retLex + if not name=="w": + retLex+=""%(name.encode('utf-8')) + + + def char_data(data): + global retLex + if data: + try: + retLex+=createData(data) + except: + """no""" + + p = xml.parsers.expat.ParserCreate() + + p.StartElementHandler = start_element + p.EndElementHandler = end_element + p.CharacterDataHandler = char_data + + p.Parse(lemmatized.encode('utf-8'),1) + #print repr(lemmatized.encode('utf-8')) + + return retLex + + + def lemmatize(self,nr='1',lang="de"): + """lemmatize""" + global ret + ret="" + + def createTag(name,attrs): + tag="<" + tag+=name + for attr in attrs.keys(): + tag+=""" %s="%s" """%(attr,attrs[attr]) + tag+=">" + return tag + + def insertW(str): + splitted=str.split() + wordlist=["%s"%split for split in splitted] + return string.join(wordlist,'\n') + + # 3 handler functions + def start_element(name, attrs): + global ret + ret+=createTag(name,attrs) + def end_element(name): + global ret + ret+=""%(name.encode('utf-8')) + + def char_data(data): + global ret + ret+=insertW(data) + + p = xml.parsers.expat.ParserCreate() + + p.StartElementHandler = start_element + p.EndElementHandler = end_element + p.CharacterDataHandler = char_data + + p.Parse(self.getPage(nr), 1) + txt=""" +
%s
+
""" + ret=txt%(lang,ret) + + return ret