--- ECHO_content/ECHO_language.py 2005/10/26 08:35:53 1.3
+++ ECHO_content/ECHO_language.py 2006/09/11 14:43:23 1.6
@@ -1,153 +1,21 @@
"""Methoden fuer Language Technologies"""
-def donatus(txt2):
- import xmlrpclib
- server = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
+from Products.ECHO_content.analyseAndTag.analyseAndTag import DonatusFile
- txt=txt2.encode('utf-8')
- bin=xmlrpclib.Binary(txt)
-
-
-
- ret=server.donatus.analyze(bin)
-
-
- return ret['morphData'].data
-
-
-def donatusVariant2Lemma(morphData):
- """creates hash variant -> morphdata"""
- ret={}
- dom=xml.dom.minidom.parseString(morphData)
- lemmas=dom.getElementsByTagName('lemma')
- for lemma in lemmas:
- variants=lemma.getElementsByTagName('variant')
- for variant in variants:
- atr=variant.getAttribute('form')
- if ret.has_key(atr):
- ret[atr].append=lemma.getAttribute('form')
- else:
- ret[atr]=[lemma.getAttribute('form')]
-
- return ret
+import xml.parsers
class ECHO_language:
"""language methods"""
- def donatusVariant2Lemma(self,nr='1'):
- """analyze by donatus"""
- return donatusVariant2Lemma(donatus(self.lemmatize(nr)))
-
- def tagLex(self,nr="1"):
- """generate Links"""
- global retLex
- global toggle
-
- toggle=0
- retLex=""
-
- lemmatized=self.lemmatize(nr)[0:]
- #print "ho",repr(lemmatized)
- variants=donatusVariant2Lemma(donatus(lemmatized))
-
- def createTag(name,attrs):
- global toggle
-
- if name=="w":
- toggle=1
- return ""
- else:
- tag="<"
- tag+=name
- for attr in attrs.keys():
- tag+=""" %s="%s" """%(attr,attrs[attr])
- tag+=">"
- return tag
-
- def createData(data):
- global toggle
- astring="""%s """
- if toggle: # tag war ein w
- toggle=0
- if variants.has_key(data):
- return astring%(variants[data][0],data)
- else:
- return astring%(data,data)
-
-
-
- # 3 handler functions
- def start_element(name, attrs):
- global retLex
-
- retLex+=createTag(name,attrs)
- def end_element(name):
- global retLex
- if not name=="w":
- retLex+="%s>"%(name.encode('utf-8'))
-
-
- def char_data(data):
- global retLex
- if data:
- try:
- retLex+=createData(data)
- except:
- """no"""
-
- p = xml.parsers.expat.ParserCreate()
-
- p.StartElementHandler = start_element
- p.EndElementHandler = end_element
- p.CharacterDataHandler = char_data
-
- p.Parse(lemmatized.encode('utf-8'),1)
- #print repr(lemmatized.encode('utf-8'))
- return retLex
+    def tagLex(self,nr="1"):
+        """Generate word tags for page `nr` via DonatusFile and return its converted XML."""
+        # Fetch the page once and reuse it; it was previously requested twice.
+        txt=self.getPage(_pn=nr)
+
+        df=DonatusFile(txt=txt,baseUri=self.baseUri)
+
+        return df.convertedXML()
-
- def lemmatize(self,nr='1',lang="de"):
- """lemmatize"""
- global ret
- ret=""
-
- def createTag(name,attrs):
- tag="<"
- tag+=name
- for attr in attrs.keys():
- tag+=""" %s="%s" """%(attr,attrs[attr])
- tag+=">"
- return tag
-
- def insertW(str):
- splitted=str.split()
- wordlist=["%s"%split for split in splitted]
- return string.join(wordlist,'\n')
-
- # 3 handler functions
- def start_element(name, attrs):
- global ret
- ret+=createTag(name,attrs)
- def end_element(name):
- global ret
- ret+="%s>"%(name.encode('utf-8'))
-
- def char_data(data):
- global ret
- ret+=insertW(data)
-
- p = xml.parsers.expat.ParserCreate()
-
- p.StartElementHandler = start_element
- p.EndElementHandler = end_element
- p.CharacterDataHandler = char_data
-
- p.Parse(self.getPage(nr), 1)
- txt="""
-
- """
- ret=txt%(lang,ret)
-
- return ret
+
\ No newline at end of file