Annotation of ECHO_content/analyseAndTag/analyseAndTag.py, revision 1.3
1.1 dwinter 1: import os.path
2: import os
3: import xmlrpclib
4: import xml.dom.minidom
5: import urllib
6:
7: from Ft.Xml.Xslt.Processor import Processor
8: from Ft.Xml.InputSource import DefaultFactory
9:
10: from Ft.Lib import Uri
11:
def package_home(gdict):
    '''
    Return the directory that contains a module.

    @param gdict: the globals() dictionary of the module; its "__file__"
                  entry is read to locate the module on disk
    @return: the directory part of that path
    '''
    module_dir, _module_file = os.path.split(gdict["__file__"])
    return module_dir
15:
def getTextFromNode(nodename):
    '''
    Concatenate the text of the direct text-node children of a DOM node.

    @param nodename: the DOM node whose childNodes are inspected (despite
                     the parameter name this is a node, not a name)
    @return: the joined data of all direct TEXT_NODE children; text nested
             inside child elements is not included
    '''
    return "".join(
        child.data
        for child in nodename.childNodes
        if child.nodeType == child.TEXT_NODE
    )
23:
24: class DonatusFile:
1.3 ! dwinter 25: def __init__(self,fileName=None,url=None,txt=None,baseUri=None):
1.1 dwinter 26: '''
27:
28: @param fileName:path to the filename
29: @url fals url
30: '''
31: if fileName:
32: self.fileName=fileName
33: self.file_uri= Uri.OsPathToUri(fileName, attemptAbsolute=1)
34: elif url:
35: self.filename=self.file_uri=url
36: elif txt:
37: self.fileName="txt"
38: self.file_uri=None
39: self.txt=txt
40: else:
41: return None
1.3 ! dwinter 42: self.baseUri=baseUri
! 43:
1.1 dwinter 44: def generateWordList(self):
45: '''
46: generate wordList (wtag format for donatus)
47: '''
48:
49: if not hasattr(self,"wordList"):
50: xsltproc = Processor()
51: xsl_uri = Uri.OsPathToUri(os.path.join(package_home(globals()),'wordlist.xsl'), attemptAbsolute=1)
52: xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))
53:
54: if self.file_uri:
55: self.wordList = xsltproc.run(DefaultFactory.fromUri(self.file_uri))[0:]
56: else:
57: self.wordList = xsltproc.run(DefaultFactory.fromString(self.txt))[0:]
58: return self.wordList
59:
60: def analyseWordList(self):
61: '''
62: wordList nach donatus
63: '''
1.2 dwinter 64: try:
65: if not hasattr(self,'analysedWordList'):
66: server=xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
1.1 dwinter 67:
1.2 dwinter 68: bin=xmlrpclib.Binary(getattr(self,"wordList",self.generateWordList()))
69:
70: ret=server.donatus.analyze(bin)
71:
72: self.analysedWordList=ret['morphData'].data[0:]
73:
74: return self.analysedWordList
75: except:
76: print "ERROR: cannot analyse words"
77: self.analyseWordList="""<?xml version="1.0"?><ERROR>cannot analyse wordlist</ERROR>"""
78: return self.analyseWordList
1.1 dwinter 79:
80: def wordListToHash(self):
81: '''
82: wordList to hash
83: '''
84: if not hasattr(self,'words'):
85:
86: self.words={}
87: dom=xml.dom.minidom.parseString(getattr(self,'analysedWordist',self.analyseWordList()))
88:
89: lemmas=dom.getElementsByTagName('lemma')
90:
91: for lemma in lemmas:
92: form=lemma.getAttribute('form')
93: variants=lemma.getElementsByTagName('variant')
94: for variant in variants:
95: formV=variant.getAttribute('form')
96: if self.words.has_key(formV) and not (form in self.words[formV]):
97: self.words[formV].append(form)
98: else:
99: self.words[formV]=[form]
100: return self.words
101:
102: def lemmatizeFile(self):
103: '''
104: lemmatize file
105: '''
106: if not hasattr(self,'lemmatizedFile'):
107: xsltproc = Processor()
108: xsl_uri = Uri.OsPathToUri(os.path.join(package_home(globals()),'lemmatize.xsl'), attemptAbsolute=1)
109: xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))
110:
1.3 ! dwinter 111: if getattr(self,'file_uri',None):
1.1 dwinter 112: lemmatized = xsltproc.run(DefaultFactory.fromUri(self.file_uri))[0:]
113: else:
1.3 ! dwinter 114: lemmatized = xsltproc.run(DefaultFactory.fromString(self.txt,self.baseUri))[0:]
1.1 dwinter 115:
116: self.lemmatizedFile=lemmatized
117:
118: return self.lemmatizedFile
119:
120: def addFormToWords(self):
121: '''
122: add form attributes to the words
123: '''
124: if not hasattr(self,'dom_with_attributes'):
1.3 ! dwinter 125:
1.1 dwinter 126: dom=xml.dom.minidom.parseString(getattr(self,'lemmatizedFile',self.lemmatizeFile()))
127:
128: wordNodes=dom.getElementsByTagName('mpiwg:w')
129: #words=getattr(self,'words',self.wordListToHash())
130: words=self.wordListToHash()
131:
132: for word in wordNodes:
133:
134: text=getTextFromNode(word)
135: text=text.lstrip().rstrip()
136:
137: if (len(text)>0) and ('.!();?[],'.find(text[-1])>-1):
138:
139: textTmp=text[0:len(text)-1]
140: else:
141: textTmp=text
142:
143:
144:
145: if words.has_key(textTmp):
146: form=words[textTmp][0]
147: word.setAttribute("mpiwg:form",form)
148: word.setAttribute("mpiwg:analysed","yes")
149: else:
150: if (textTmp!="") and (textTmp !=" "):
151: word.setAttribute("mpiwg:form",textTmp)
152: word.setAttribute("mpiwg:analysed","no")
153: self.dom_with_attributes=dom
154: return self.dom_with_attributes
155:
156: def convertedXML(self):
157: dom=getattr(self,'dom_with_attributes',self.addFormToWords())
158: return dom.toxml('utf-8')
159:
160: def wordsToLinks(self):
161: xmlTxt=self.convertedXML()
162:
163: global retLex
164: global toggle
165:
166: toggle=0
167: retLex=""
168: saved_attrs={}
169:
170: def createTag(name,attrs):
171: global toggle
172: global saved_attrs
173: if name=="mpiwg:w":
174: toggle=1
175: saved_attrs=attrs
176: return ""
177: else:
178: tag="<"
179: tag+=name
180: for attr in attrs.keys():
181: tag+=""" %s="%s" """%(attr,attrs[attr])
182: tag+=">"
183: return tag
184:
185: def createData(data):
186: global toggle
187: global saved_attrs
188: print saved_attrs
189: astring="""<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """
190: urlString="""http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de"""
191: if toggle: # tag war ein w
192: toggle=0
193: if saved_attrs.has_key('mpiwg:form'):
194: if saved_attrs['mpiwg:analysed']=='yes':
195:
196: return astring%(saved_attrs['mpiwg:form'],data)
197: else:
198: return "<a>"+data+"</a>"
199: else:
200: return data
201:
202:
203:
204:
205: # 3 handler functions
206: def start_element(name, attrs):
207: global retLex
208:
209: retLex+=createTag(name,attrs)
210: def end_element(name):
211: global retLex
212: if not name=="mpiwg:w":
213: retLex+="</%s>"%(name.encode('utf-8'))
214:
215:
216: def char_data(data):
217: global retLex
218: retLex+=createData(data)
219: if data:
220: try:
221: retLex+=createData(data)
222: except:
223: """no"""
224:
225: p = xml.parsers.expat.ParserCreate()
226:
227: p.StartElementHandler = start_element
228: p.EndElementHandler = end_element
229: p.CharacterDataHandler = char_data
230:
231: p.Parse(xmlTxt,1)
232: #print repr(lemmatized.encode('utf-8'))
233:
234: return retLex
235:
236:
237: #def convertFile(source,target):
238: # '''
239: # @param source:source directory tree
240: # @param target: target directory tree
241: # '''
242: #
243: # if not os.path.exists(target):
244: # os.mkdir(target)
245: # for root,dirs,files in os.walk(source):
246: #
247: # for dir in dirs:
248: #
249: # dirName=os.path.join(root,dir).replace(source,target)
250: # if not os.path.exists(dirName):
251: # os.mkdir(dirName)
252: #
253: # for name in files:
254: # fileName=os.path.join(root,name)
255: #
256: # if os.path.splitext(fileName)[1]==".xml":
257: # fileNameNeu=fileName.replace(source,target)
258: # print "processing",fileNameNeu
259: # fh=file(fileNameNeu,"w")
260: # try:
261: # fh.write(donatusFile(fileName).convertedXML())
262: # except:
263: # print "ERROR:",fileName
264: # fh.close()
265: #
266: #rootDir="/Users/dwinter/Diss/Quellen-primaer/Formax/Done"
267: #rootDirNeu="/Users/dwinter/Diss/Quellen-primaer/transformed0.1"
268: #
269: #convertFile(rootDir,rootDirNeu)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>