Annotation of ECHO_content/ECHO_language.py, revision 1.3
1.1 dwinter 1: """Methods for language technologies."""
1.2 dwinter 2:
def donatus(txt2, serverUrl="http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc"):
    """Send a text to the Donatus XML-RPC service and return its morphology data.

    txt2      -- text to analyze; it is encoded to UTF-8 before sending.
    serverUrl -- URL of the XML-RPC endpoint; defaults to the Archimedes
                 Donatus server.

    Returns the ``data`` payload of the ``morphData`` entry of the server's
    answer. Raises whatever the XML-RPC layer raises on transport or
    protocol errors, and KeyError if the answer has no ``morphData`` entry.
    """
    try:
        import xmlrpclib  # Python 2 module name
    except ImportError:
        import xmlrpc.client as xmlrpclib  # same module under Python 3

    server = xmlrpclib.ServerProxy(serverUrl)

    # The text is sent as an XML-RPC binary value holding the UTF-8 bytes.
    payload = xmlrpclib.Binary(txt2.encode('utf-8'))
    result = server.donatus.analyze(payload)

    return result['morphData'].data
1.1 dwinter 17:
18:
def donatusVariant2Lemma(morphData):
    """Build a mapping from variant form to the lemma forms it belongs to.

    morphData -- XML document (string) containing ``lemma`` elements, each
                 with a ``form`` attribute and ``variant`` descendants that
                 carry a ``form`` attribute of their own.

    Returns a dict whose keys are the variants' ``form`` values and whose
    values are lists of the ``form`` values of every lemma under which that
    variant occurs, in document order.
    """
    # Imported locally, like xmlrpclib in donatus(): the module has no
    # file-level import of xml.dom.minidom.
    from xml.dom import minidom

    ret = {}
    dom = minidom.parseString(morphData)
    for lemma in dom.getElementsByTagName('lemma'):
        lemmaForm = lemma.getAttribute('form')
        for variant in lemma.getElementsByTagName('variant'):
            # A variant can belong to several lemmas, so collect all of them.
            ret.setdefault(variant.getAttribute('form'), []).append(lemmaForm)

    return ret
1.2 dwinter 34:
class ECHO_language:
    """Language-technology helpers: lemmatization markup and dictionary links.

    The methods call ``self.getPage(nr)``, which is not defined in this
    class and must be supplied by the class this one is combined with. Its
    result is fed straight to an XML parser, so it has to be well-formed XML.
    """

    def donatusVariant2Lemma(self,nr='1'):
        """Return the variant -> lemma mapping Donatus yields for page ``nr``."""
        return donatusVariant2Lemma(donatus(self.lemmatize(nr)))

    def tagLex(self,nr="1"):
        """Return page ``nr`` with every word turned into a dictionary link.

        The page is lemmatized, the result is sent to Donatus, and the
        lemmatized markup is then re-emitted with each ``<w>`` element
        replaced by an ``<a>`` link to the dictionary CGI. The link's
        ``word`` parameter is the first lemma recorded for the word, or the
        word itself when Donatus reported none. All other elements are
        copied through; text outside ``<w>`` elements is dropped.
        """
        from xml.parsers import expat
        from xml.sax.saxutils import escape, quoteattr

        link = ('<a href="http://141.14.236.86/cgi-bin/toc/dict?'
                'step=remotetable;word=%s;lang=de" target="_blank">%s</a> ')

        lemmatized = self.lemmatize(nr)
        variants = donatusVariant2Lemma(donatus(lemmatized))

        # State lives in these two locals instead of module globals, so
        # concurrent calls cannot disturb each other. The dict is a mutable
        # cell the nested handlers can update.
        out = []
        state = {'in_w': False}

        def start_element(name, attrs):
            if name == "w":
                # <w> itself is not copied; it only marks the next text
                # chunk as a word to be linked.
                state['in_w'] = True
                return
            tag = "<" + name
            for attr in attrs.keys():
                tag += " %s=%s " % (attr, quoteattr(attrs[attr]))
            out.append(tag + ">")

        def end_element(name):
            if name == "w":
                state['in_w'] = False
                return
            out.append("</%s>" % name)

        def char_data(data):
            # Only the text of a <w> element is emitted. Everything else is
            # the whitespace lemmatize() puts between words and is dropped.
            if not data or not state['in_w']:
                return
            state['in_w'] = False
            lemma = variants.get(data, [data])[0]
            # The parser hands over decoded text, so it is escaped again
            # before being placed into attribute and element content.
            out.append(link % (escape(lemma, {'"': '&quot;'}), escape(data)))

        p = expat.ParserCreate()
        # Let expat deliver the text of an element in one piece where it can,
        # so a word is not looked up in fragments.
        p.buffer_text = True
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data

        p.Parse(lemmatized.encode('utf-8'),1)

        return "".join(out)

    def lemmatize(self,nr='1',lang="de"):
        """Return page ``nr`` with every whitespace-separated word in ``<w>``.

        nr   -- page identifier, passed unchanged to ``self.getPage``.
        lang -- value of the ``lang`` attribute of the enclosing ``section``.

        The page's elements are copied through; each run of text is split on
        whitespace and every word becomes ``<w>word</w>``, the words of one
        run being separated by newlines. The result is wrapped in
        ``<wtag locator="xxx"><section lang=...><s>...</s></section></wtag>``.
        Raises ``xml.parsers.expat.ExpatError`` if the page is not well-formed.
        """
        from xml.parsers import expat
        from xml.sax.saxutils import escape, quoteattr

        # Collected output fragments; a local list instead of a module global
        # keeps concurrent calls independent.
        out = []

        def start_element(name, attrs):
            tag = "<" + name
            for attr in attrs.keys():
                tag += " %s=%s " % (attr, quoteattr(attrs[attr]))
            out.append(tag + ">")

        def end_element(name):
            out.append("</%s>" % name)

        def char_data(data):
            # The parser delivers decoded text; escape it again so that the
            # result stays well-formed XML.
            words = ["<w>%s</w>" % escape(word) for word in data.split()]
            out.append("\n".join(words))

        p = expat.ParserCreate()
        # Deliver contiguous text in one piece where possible, so a word
        # containing an entity reference is not split into several <w>.
        p.buffer_text = True
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data

        p.Parse(self.getPage(nr), 1)

        wrapper = ('<wtag locator="xxx">\n'
                   '<section lang=%s><s>%s</s></section>\n'
                   '</wtag>')
        return wrapper % (quoteattr(lang), "".join(out))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>