# Donatus client: generate word lists, fetch morphological analyses via
# XML-RPC, and annotate/link words in lemmatized XML documents.
import os
import os.path
import urllib
import xml.dom.minidom
import xml.parsers.expat
import xmlrpclib

from Ft.Lib import Uri
from Ft.Xml.InputSource import DefaultFactory
from Ft.Xml.Xslt.Processor import Processor
def package_home(gdict):
    '''
    Return the directory of the module whose globals dict is given.

    @param gdict: a module globals() dict containing "__file__"
    '''
    return os.path.dirname(gdict["__file__"])
15:
def getTextFromNode(nodename):
    '''
    Concatenate the character data of all direct text-node children.

    @param nodename: a DOM node
    @return: the joined text of its immediate TEXT_NODE children
    '''
    # join() instead of repeated string concatenation (the original loop
    # was quadratic in the number of text children)
    return "".join(node.data for node in nodename.childNodes
                   if node.nodeType == node.TEXT_NODE)
23:
class DonatusFile:
    '''
    Wrapper around one document (file, URL or raw text) that generates a
    word list, sends it to the Donatus morphological-analysis XML-RPC
    service, and annotates / links the words of the lemmatized document.
    All expensive results are cached as instance attributes.
    '''

    def __init__(self, fileName=None, url=None, txt=None, baseUri=None):
        '''
        @param fileName: path to the file
        @param url: URL of the document (used when fileName is not given)
        @param txt: raw document text (used when neither is given)
        @param baseUri: base URI for resolving references when parsing raw text
        '''
        if fileName:
            self.fileName = fileName
            self.file_uri = Uri.OsPathToUri(fileName, attemptAbsolute=1)
        elif url:
            # BUGFIX: the original assigned self.filename (lowercase n),
            # leaving self.fileName unset in the URL case.
            self.fileName = self.file_uri = url
        elif txt:
            self.fileName = "txt"
            self.file_uri = None
            self.txt = txt
        else:
            # no source given: leave the instance unconfigured
            # (preserves the original early return; baseUri stays unset)
            return None
        self.baseUri = baseUri

    def generateWordList(self):
        '''
        Generate the word list (wtag format for Donatus) by applying
        wordlist.xsl to the document.

        @return: the word-list XML string (cached in self.wordList)
        '''
        if not hasattr(self, "wordList"):
            xsltproc = Processor()
            xsl_uri = Uri.OsPathToUri(
                os.path.join(package_home(globals()), 'wordlist.xsl'),
                attemptAbsolute=1)
            xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))
            if self.file_uri:
                self.wordList = xsltproc.run(DefaultFactory.fromUri(self.file_uri))[0:]
            else:
                self.wordList = xsltproc.run(DefaultFactory.fromString(self.txt))[0:]
        return self.wordList

    def analyseWordList(self):
        '''
        Send the word list to the Donatus service for analysis.

        @return: the morphological-analysis XML (cached in
            self.analysedWordList); on any failure an XML error document
            is cached and returned instead (best-effort by design).
        '''
        # BUGFIX: the original body started with a stray undefined name
        # ('prssafsaf'), so every call raised NameError and fell into the
        # error branch; removed.
        try:
            if not hasattr(self, 'analysedWordList'):
                server = xmlrpclib.ServerProxy(
                    "http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
                # generate the word list only when it is not cached yet
                # (the original getattr(..., self.generateWordList())
                # evaluated the default eagerly on every call)
                if not hasattr(self, "wordList"):
                    self.generateWordList()
                payload = xmlrpclib.Binary(self.wordList)
                ret = server.donatus.analyze(payload)
                self.analysedWordList = ret['morphData'].data[0:]
            return self.analysedWordList
        except Exception:
            print("ERROR: cannot analyse words")
            # BUGFIX: the original assigned self.analyseWordList here,
            # shadowing this method on the instance with a string.
            self.analysedWordList = """<?xml version="1.0"?><ERROR>cannot analyse wordlist</ERROR>"""
            return self.analysedWordList

    def wordListToHash(self):
        '''
        Build a mapping from variant form to the list of lemma forms.

        @return: dict {variant form: [lemma forms]} (cached in self.words)
        '''
        if not hasattr(self, 'words'):
            self.words = {}
            # BUGFIX: the original read the misspelled attribute
            # 'analysedWordist' with an eagerly evaluated getattr default,
            # so the analysis ran on every call even when cached.
            if not hasattr(self, 'analysedWordList'):
                self.analyseWordList()
            dom = xml.dom.minidom.parseString(self.analysedWordList)
            for lemma in dom.getElementsByTagName('lemma'):
                form = lemma.getAttribute('form')
                for variant in lemma.getElementsByTagName('variant'):
                    formV = variant.getAttribute('form')
                    # BUGFIX: the original reset the list to [form] when
                    # the lemma was already recorded for this variant,
                    # discarding previously collected lemmas.
                    forms = self.words.setdefault(formV, [])
                    if form not in forms:
                        forms.append(form)
        return self.words

    def lemmatizeFile(self):
        '''
        Apply lemmatize.xsl to the document (wraps words in mpiwg:w tags).

        @return: the lemmatized XML string (cached in self.lemmatizedFile)
        '''
        if not hasattr(self, 'lemmatizedFile'):
            xsltproc = Processor()
            xsl_uri = Uri.OsPathToUri(
                os.path.join(package_home(globals()), 'lemmatize.xsl'),
                attemptAbsolute=1)
            xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))
            if getattr(self, 'file_uri', None):
                lemmatized = xsltproc.run(DefaultFactory.fromUri(self.file_uri))[0:]
            else:
                lemmatized = xsltproc.run(
                    DefaultFactory.fromString(self.txt, self.baseUri))[0:]
            self.lemmatizedFile = lemmatized
        return self.lemmatizedFile

    def addFormToWords(self):
        '''
        Add mpiwg:form / mpiwg:analysed attributes to every mpiwg:w
        element of the lemmatized document.

        @return: the annotated DOM (cached in self.dom_with_attributes)
        '''
        if not hasattr(self, 'dom_with_attributes'):
            # BUGFIX: eager getattr default re-ran lemmatizeFile per call
            if not hasattr(self, 'lemmatizedFile'):
                self.lemmatizeFile()
            dom = xml.dom.minidom.parseString(self.lemmatizedFile)
            words = self.wordListToHash()
            for word in dom.getElementsByTagName('mpiwg:w'):
                text = getTextFromNode(word).strip()
                # strip a single trailing punctuation char before lookup
                if text and '.!();?[],'.find(text[-1]) > -1:
                    textTmp = text[:-1]
                else:
                    textTmp = text
                if textTmp in words:
                    # take the first lemma as the canonical form
                    word.setAttribute("mpiwg:form", words[textTmp][0])
                    word.setAttribute("mpiwg:analysed", "yes")
                elif textTmp != "" and textTmp != " ":
                    word.setAttribute("mpiwg:form", textTmp)
                    word.setAttribute("mpiwg:analysed", "no")
            self.dom_with_attributes = dom
        return self.dom_with_attributes

    def convertedXML(self):
        '''
        @return: the annotated document serialised as utf-8 XML.
        '''
        if not hasattr(self, 'dom_with_attributes'):
            self.addFormToWords()
        return self.dom_with_attributes.toxml('utf-8')

    def wordsToLinks(self):
        '''
        Serialise the annotated document, replacing each mpiwg:w element
        by an HTML link to the dictionary service (analysed words link to
        the lookup CGI, unanalysed ones get a bare <a> wrapper).

        @return: the resulting markup as one string
        '''
        xmlTxt = self.convertedXML()

        # Parser state lives in local mutable containers; the original
        # used module-level globals ('retLex', 'toggle'), which leaked
        # state into the module and was not safe across instances.
        state = {'toggle': 0, 'saved_attrs': {}}
        out = []

        astring = """<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """

        def createTag(name, attrs):
            # mpiwg:w tags are swallowed; their attributes are saved for
            # the following character data
            if name == "mpiwg:w":
                state['toggle'] = 1
                state['saved_attrs'] = attrs
                return ""
            tag = "<" + name
            for attr in attrs.keys():
                tag += """ %s="%s" """ % (attr, attrs[attr])
            tag += ">"
            return tag

        def createData(data):
            if state['toggle']:  # previous tag was a mpiwg:w
                state['toggle'] = 0
                saved = state['saved_attrs']
                if 'mpiwg:form' in saved:
                    if saved['mpiwg:analysed'] == 'yes':
                        return astring % (saved['mpiwg:form'], data)
                    return "<a>" + data + "</a>"
            # BUGFIX: the original returned None for text outside of
            # mpiwg:w elements, which made the unguarded first
            # 'retLex+=createData(data)' raise TypeError.
            return data

        # expat handler functions
        def start_element(name, attrs):
            out.append(createTag(name, attrs))

        def end_element(name):
            if name != "mpiwg:w":
                out.append("</%s>" % (name.encode('utf-8')))

        def char_data(data):
            # BUGFIX: the original appended the data twice (once
            # unguarded, once inside try/except).
            if data:
                out.append(createData(data))

        p = xml.parsers.expat.ParserCreate()
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data
        p.Parse(xmlTxt, 1)

        return "".join(out)
236:
237:
238: #def convertFile(source,target):
239: # '''
240: # @param source:source directory tree
241: # @param target: target directory tree
242: # '''
243: #
244: # if not os.path.exists(target):
245: # os.mkdir(target)
246: # for root,dirs,files in os.walk(source):
247: #
248: # for dir in dirs:
249: #
250: # dirName=os.path.join(root,dir).replace(source,target)
251: # if not os.path.exists(dirName):
252: # os.mkdir(dirName)
253: #
254: # for name in files:
255: # fileName=os.path.join(root,name)
256: #
257: # if os.path.splitext(fileName)[1]==".xml":
258: # fileNameNeu=fileName.replace(source,target)
259: # print "processing",fileNameNeu
260: # fh=file(fileNameNeu,"w")
261: # try:
262: # fh.write(donatusFile(fileName).convertedXML())
263: # except:
264: # print "ERROR:",fileName
265: # fh.close()
266: #
267: #rootDir="/Users/dwinter/Diss/Quellen-primaer/Formax/Done"
268: #rootDirNeu="/Users/dwinter/Diss/Quellen-primaer/transformed0.1"
269: #
270: #convertFile(rootDir,rootDirNeu)