Annotation of ECHO_content/ECHO_language.py, revision 1.4
1.1 dwinter 1: """Methods for language technologies."""
1.2 dwinter 2:
1.4 ! dwinter 3:
! 4: from Products.ECHO_content.analyseAndTag.analyseAndTag import DonatusFile
! 5:
! 6: import xml.parsers
! 7:
def donatus(txt2):
    """Send *txt2* to the remote donatus XML-RPC service and return its morph data.

    The text is UTF-8 encoded, wrapped in an XML-RPC ``Binary`` value and
    passed to ``donatus.analyze``.  The function returns the ``data``
    attribute of the ``'morphData'`` entry of the server's reply.
    """
    import xmlrpclib

    payload = xmlrpclib.Binary(txt2.encode('utf-8'))
    proxy = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
    reply = proxy.donatus.analyze(payload)
    morph = reply['morphData']
    return morph.data
1.1 dwinter 22:
23:
def donatusVariant2Lemma(morphData):
    """Build a mapping from variant form to the lemma forms it occurs under.

    *morphData* is an XML document (as a string) containing ``lemma``
    elements, each of which may contain ``variant`` elements; both carry a
    ``form`` attribute.

    Returns a dict whose keys are the ``form`` attributes of the variants
    and whose values are lists with the ``form`` attribute of every lemma
    that contains such a variant, in document order.
    """
    # Imported locally: the module itself only imports ``xml.parsers``, so
    # ``xml.dom.minidom`` is not guaranteed to be loaded.
    from xml.dom import minidom

    ret = {}
    dom = minidom.parseString(morphData)
    for lemma in dom.getElementsByTagName('lemma'):
        lemmaForm = lemma.getAttribute('form')
        for variant in lemma.getElementsByTagName('variant'):
            # One variant may belong to several lemmas, so collect all of
            # them instead of keeping only the first one.
            ret.setdefault(variant.getAttribute('form'), []).append(lemmaForm)
    return ret
1.2 dwinter 39:
class ECHO_language:
    """Language methods.

    The methods call ``self.getPage``, which is not defined in this class
    and therefore has to be provided by the class this one is combined with.
    """

    def _startTag(self, name, attrs):
        """Return the start tag for element *name* with attribute dict *attrs*.

        Attribute values are escaped because expat hands them over already
        unescaped; writing them back raw would produce broken markup.
        """
        from xml.sax.saxutils import escape

        pieces = ["<", name]
        for attr in attrs.keys():
            value = escape(attrs[attr], {'"': "&quot;"})
            pieces.append(""" %s="%s" """ % (attr, value))
        pieces.append(">")
        return "".join(pieces)

    def donatusVariant2Lemma(self, nr='1'):
        """Lemmatize page *nr*, analyze it with donatus and return the
        variant -> lemma mapping built by the module-level
        ``donatusVariant2Lemma``."""
        return donatusVariant2Lemma(donatus(self.lemmatize(nr)))

    def tagLex(self, nr="1"):
        """Generate links for page *nr*.

        The page text is handed to ``DonatusFile`` and the result of its
        ``wordsToLinks()`` is returned.
        """
        # Fetch the page once; the text was previously requested twice and
        # one of the results was never used.
        page = self.getPage(_pn=nr)
        return DonatusFile(txt=page).wordsToLinks()

    def tagLex_old(self, nr="1"):
        """Generate dictionary links for page *nr* (older implementation).

        The page is lemmatized, the result is analyzed by donatus and the
        lemmatized XML is parsed again.  Every ``<w>`` element is replaced by
        an ``<a>`` link whose ``word`` parameter is the first lemma known for
        that word, or the word itself if donatus returned no lemma for it.
        All other elements are copied; text outside ``<w>`` is dropped.
        """
        from xml.parsers import expat
        from xml.sax.saxutils import escape

        lemmatized = self.lemmatize(nr)
        variants = donatusVariant2Lemma(donatus(lemmatized))

        link = """<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """

        # Per-call state kept in the closure instead of module globals, so
        # that separate calls cannot overwrite each other's output.
        parts = []
        state = {"inWord": False}

        def start_element(name, attrs):
            if name == "w":
                state["inWord"] = True
            else:
                parts.append(self._startTag(name, attrs))

        def end_element(name):
            if name == "w":
                state["inWord"] = False
            else:
                parts.append("</%s>" % name)

        def char_data(data):
            # Only the text of a <w> element becomes a link; anything else
            # is skipped on purpose.
            if not data or not state["inWord"]:
                return
            if data in variants:
                word = variants[data][0]
            else:
                word = data
            parts.append(link % (escape(word, {'"': "&quot;"}), escape(data)))

        p = expat.ParserCreate()
        # Deliver the text of an element in one piece; otherwise a word
        # containing an entity arrives in several chunks.
        p.buffer_text = True
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data

        p.Parse(lemmatized.encode('utf-8'), 1)

        return "".join(parts)

    def lemmatize(self, nr='1', lang="de"):
        """Wrap every word of page *nr* in a ``<w>`` element.

        The page returned by ``self.getPage(nr)`` is parsed as XML.  Elements
        are copied, and each run of character data is split on whitespace
        with every token emitted as ``<w>token</w>`` (tokens of one run are
        joined by newlines).  The result is embedded in a ``<wtag>`` /
        ``<section lang=...>`` / ``<s>`` envelope, with *lang* used for the
        ``lang`` attribute.
        """
        from xml.parsers import expat
        from xml.sax.saxutils import escape

        # Per-call accumulator instead of a module-level global.
        parts = []

        def start_element(name, attrs):
            parts.append(self._startTag(name, attrs))

        def end_element(name):
            parts.append("</%s>" % name)

        def char_data(data):
            # Escape each token so that the output stays well-formed XML.
            words = ["<w>%s</w>" % escape(word) for word in data.split()]
            parts.append("\n".join(words))

        p = expat.ParserCreate()
        # Keep contiguous text together so that words are not split at
        # entity boundaries.
        p.buffer_text = True
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data

        p.Parse(self.getPage(nr), 1)

        template = ('<wtag locator="xxx">\n'
                    '<section lang="%s"><s>%s</s></section>\n'
                    '</wtag>')
        return template % (lang, "".join(parts))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>