--- ECHO_content/ECHO_language.py 2004/11/24 15:17:41 1.1 +++ ECHO_content/ECHO_language.py 2004/11/27 22:38:44 1.2 @@ -1,4 +1,5 @@ """Methoden fuer Language Technologies""" + def donatus(txt2): import xmlrpclib @@ -30,3 +31,123 @@ def donatusVariant2Lemma(morphData): ret[atr]=[lemma.getAttribute('form')] return ret + +class ECHO_language: + """language methods""" + + def donatusVariant2Lemma(self,nr='1'): + """analyze by donatus""" + return donatusVariant2Lemma(donatus(self.lemmatize(nr))) + + def tagLex(self,nr="1"): + """generate Links""" + global retLex + global toggle + + toggle=0 + retLex="" + + lemmatized=self.lemmatize(nr)[0:] + #print "ho",repr(lemmatized) + variants=donatusVariant2Lemma(donatus(lemmatized)) + + def createTag(name,attrs): + global toggle + + if name=="w": + toggle=1 + return "" + else: + tag="<" + tag+=name + for attr in attrs.keys(): + tag+=""" %s="%s" """%(attr,attrs[attr]) + tag+=">" + return tag + + def createData(data): + global toggle + astring="""%s """ + if toggle: # tag war ein w + toggle=0 + if variants.has_key(data): + return astring%(variants[data][0],data) + else: + return astring%(data,data) + + + + # 3 handler functions + def start_element(name, attrs): + global retLex + + retLex+=createTag(name,attrs) + def end_element(name): + global retLex + if not name=="w": + retLex+=""%(name.encode('utf-8')) + + + def char_data(data): + global retLex + if data: + try: + retLex+=createData(data) + except: + """no""" + + p = xml.parsers.expat.ParserCreate() + + p.StartElementHandler = start_element + p.EndElementHandler = end_element + p.CharacterDataHandler = char_data + + p.Parse(lemmatized.encode('utf-8'),1) + #print repr(lemmatized.encode('utf-8')) + + return retLex + + + def lemmatize(self,nr='1',lang="de"): + """lemmatize""" + global ret + ret="" + + def createTag(name,attrs): + tag="<" + tag+=name + for attr in attrs.keys(): + tag+=""" %s="%s" """%(attr,attrs[attr]) + tag+=">" + return tag + + def insertW(str): + splitted=str.split() + wordlist=["%s"%split for split in splitted] + return string.join(wordlist,'\n') + + # 3 handler functions + def start_element(name, attrs): + global ret + ret+=createTag(name,attrs) + def end_element(name): + global ret + ret+=""%(name.encode('utf-8')) + + def char_data(data): + global ret + ret+=insertW(data) + + p = xml.parsers.expat.ParserCreate() + + p.StartElementHandler = start_element + p.EndElementHandler = end_element + p.CharacterDataHandler = char_data + + p.Parse(self.getPage(nr), 1) + txt=""" +
%s
+
""" + ret=txt%(lang,ret) + + return ret