1: """Methoden fuer Language Technologies"""
2:
def donatus(txt2, serverUrl="http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc"):
    """Send text to the donatus XML-RPC service and return its morphology data.

    txt2      -- text to analyze; it is encoded as UTF-8 before sending, so it
                 must be a text (unicode) string.
    serverUrl -- XML-RPC endpoint; defaults to the address this module has
                 always used, so existing callers are unaffected.

    Returns the ``data`` attribute of the ``morphData`` entry of the reply of
    the remote ``donatus.analyze`` call.
    NOTE(review): presumably that is the XML document consumed by
    donatusVariant2Lemma -- confirm against the service.

    Network or protocol failures raised by the XML-RPC library are not caught
    here and propagate to the caller.
    """
    # Kept as a function-level import, as in the original. The module was
    # renamed in Python 3, so fall back to the new name when the old one
    # is unavailable.
    try:
        import xmlrpclib
    except ImportError:
        import xmlrpc.client as xmlrpclib

    server = xmlrpclib.ServerProxy(serverUrl)

    # Wrap the UTF-8 bytes as XML-RPC binary data so they travel unmodified.
    # (Named "payload" instead of "bin" to avoid shadowing the builtin.)
    payload = xmlrpclib.Binary(txt2.encode('utf-8'))

    reply = server.donatus.analyze(payload)
    return reply['morphData'].data
17:
18:
def donatusVariant2Lemma(morphData):
    """Build a dictionary mapping each variant form to its lemma forms.

    morphData -- XML document (string) containing ``lemma`` elements, each
                 with a ``form`` attribute and nested ``variant`` elements
                 that carry a ``form`` attribute of their own.

    Returns a dict ``{variant form: [lemma form, ...]}``. A variant listed
    under several lemmas collects all of them, in document order.

    Raises whatever xml.dom.minidom.parseString raises for malformed input.
    """
    # Imported locally: the module has no file-level import for it.
    import xml.dom.minidom

    variant2lemmas = {}
    dom = xml.dom.minidom.parseString(morphData)
    for lemma in dom.getElementsByTagName('lemma'):
        lemmaForm = lemma.getAttribute('form')
        for variant in lemma.getElementsByTagName('variant'):
            variantForm = variant.getAttribute('form')
            # setdefault creates the list on first sight of a variant and
            # appends on every later one (the old code assigned to the
            # list's "append" attribute instead of calling it).
            variant2lemmas.setdefault(variantForm, []).append(lemmaForm)

    return variant2lemmas
34:
class ECHO_language:
    """Language methods: word tagging and dictionary links for page text.

    This class does not define ``getPage`` itself; ``lemmatize`` calls
    ``self.getPage(nr)`` and feeds the result to an XML parser, so the host
    class must provide it and it must return well-formed XML.
    """

    def _openTag(self, name, attrs):
        """Serialize an opening tag for element ``name`` with ``attrs``.

        Each attribute is written as `` key="value" `` (padded with one space
        on both sides). Attribute values are XML-escaped because the parser
        hands them over already decoded; writing them back raw would produce
        malformed markup for values containing ``&``, ``<`` or ``"``.
        """
        from xml.sax.saxutils import escape

        parts = ["<", name]
        for attr in attrs.keys():
            value = escape(attrs[attr], {'"': '&quot;'})
            parts.append(""" %s="%s" """ % (attr, value))
        parts.append(">")
        return "".join(parts)

    def donatusVariant2Lemma(self, nr='1'):
        """Analyze page ``nr`` with donatus; return its variant -> lemmas dict."""
        return donatusVariant2Lemma(donatus(self.lemmatize(nr)))

    def tagLex(self, nr="1"):
        """Return page ``nr`` with every word turned into a dictionary link.

        The page is word-tagged by ``lemmatize`` and analyzed by donatus.
        Each ``<w>`` word is then replaced by an ``<a>`` link whose target
        looks up the first lemma donatus reported for that word, or the word
        itself when donatus knows no lemma for it. All other elements are
        copied through; text outside ``<w>`` elements is dropped.
        """
        # Imported locally: the module has no file-level imports for these.
        import xml.parsers.expat
        from xml.sax.saxutils import escape

        lemmatized = self.lemmatize(nr)
        variants = donatusVariant2Lemma(donatus(lemmatized))

        linkTemplate = """<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """

        # Parser state lives in these closure-local containers instead of
        # module globals, so concurrent or nested calls cannot interfere.
        pieces = []
        state = {'inWord': False}

        def start_element(name, attrs):
            if name == "w":
                # <w> itself is not copied; it only marks the next text
                # chunk as a word to be linked.
                state['inWord'] = True
                return
            pieces.append(self._openTag(name, attrs))

        def end_element(name):
            if name == "w":
                state['inWord'] = False
                return
            pieces.append("</%s>" % name)

        def char_data(data):
            if not state['inWord']:
                # Text outside <w> (the whitespace between tags) is dropped.
                return
            state['inWord'] = False
            if data in variants:
                lookup = variants[data][0]
            else:
                lookup = data
            # NOTE(review): the lookup word is XML-escaped but not
            # URL-quoted, as before -- confirm what the dictionary expects.
            pieces.append(linkTemplate % (escape(lookup, {'"': '&quot;'}),
                                          escape(data)))

        parser = xml.parsers.expat.ParserCreate()
        # Deliver each word as one chunk even when it contains an entity.
        parser.buffer_text = True
        parser.StartElementHandler = start_element
        parser.EndElementHandler = end_element
        parser.CharacterDataHandler = char_data

        parser.Parse(lemmatized.encode('utf-8'), 1)

        return "".join(pieces)

    def lemmatize(self, nr='1', lang="de"):
        """Return page ``nr`` with every word wrapped in a ``<w>`` element.

        The page returned by ``self.getPage(nr)`` is parsed as XML and
        re-serialized: elements are copied, and each run of text is split on
        whitespace with every word emitted as ``<w>word</w>`` (words of one
        run are separated by newlines). The result is wrapped in a
        ``<wtag>``/``<section lang=...>``/``<s>`` envelope, where ``lang`` is
        written into the section's ``lang`` attribute.

        Raises xml.parsers.expat.ExpatError if the page is not well-formed.
        """
        # Imported locally: the module has no file-level imports for these.
        import xml.parsers.expat
        from xml.sax.saxutils import escape

        # Output is collected in a closure-local list instead of a module
        # global, so concurrent or nested calls cannot interfere.
        pieces = []

        def start_element(name, attrs):
            pieces.append(self._openTag(name, attrs))

        def end_element(name):
            pieces.append("</%s>" % name)

        def char_data(data):
            # The parser delivers decoded text; escape it again so the
            # output stays well-formed XML.
            words = ["<w>%s</w>" % escape(word) for word in data.split()]
            pieces.append('\n'.join(words))

        parser = xml.parsers.expat.ParserCreate()
        # Without buffering the parser may deliver one text run in several
        # chunks (e.g. around entities), which would split a word in two.
        parser.buffer_text = True
        parser.StartElementHandler = start_element
        parser.EndElementHandler = end_element
        parser.CharacterDataHandler = char_data

        parser.Parse(self.getPage(nr), 1)

        envelope = ('<wtag locator="xxx">\n'
                    '<section lang="%s"><s>%s</s></section>\n'
                    '</wtag>')
        return envelope % (lang, "".join(pieces))
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>