Annotation of ECHO_content/analyseAndTag/analyseAndTag.py, revision 1.1
1.1 ! dwinter 1: import os.path
! 2: import os
! 3: import xmlrpclib
! 4: import xml.dom.minidom
! 5: import urllib
! 6:
! 7: from Ft.Xml.Xslt.Processor import Processor
! 8: from Ft.Xml.InputSource import DefaultFactory
! 9:
! 10: from Ft.Lib import Uri
! 11:
def package_home(gdict):
    """Return the directory that contains the module owning *gdict*.

    gdict is a module globals dictionary; its "__file__" entry is used.
    """
    directory, _basename = os.path.split(gdict["__file__"])
    return directory
! 15:
def getTextFromNode(nodename):
    """Return the concatenated data of the direct text children of a node.

    Only immediate children of type TEXT_NODE contribute; text nested in
    child elements is ignored.
    """
    return "".join(
        child.data
        for child in nodename.childNodes
        if child.nodeType == child.TEXT_NODE
    )
! 23:
class DonatusFile:
    """Lemmatize an XML text with the help of the Donatus XML-RPC service.

    Each processing step caches its result on the instance, so the steps
    can be called in any order and are computed at most once:
    generateWordList -> analyseWordList -> wordListToHash, and
    lemmatizeFile -> addFormToWords -> convertedXML -> wordsToLinks.
    """

    def __init__(self, fileName=None, url=None, txt=None):
        """Select the source document.

        The first non-empty argument wins, in this order.

        @param fileName: path of an XML file on disk
        @param url: URL of the XML document, used when no fileName is given
        @param txt: the XML document itself as a string
        """
        self.fileName = None
        self.file_uri = None
        self.txt = None
        if fileName:
            self.fileName = fileName
            self.file_uri = Uri.OsPathToUri(fileName, attemptAbsolute=1)
        elif url:
            # The original assigned to the misspelt ``filename`` only, so
            # ``fileName`` was missing for URL sources.
            self.fileName = self.file_uri = url
            self.filename = url  # kept for callers that read the old name
        elif txt:
            self.fileName = "txt"
            self.txt = txt

    def _applyStylesheet(self, xslName):
        """Run the stylesheet *xslName* (located next to this module) on the source."""
        xsltproc = Processor()
        xsl_uri = Uri.OsPathToUri(
            os.path.join(package_home(globals()), xslName), attemptAbsolute=1)
        xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))

        if self.file_uri:
            source = DefaultFactory.fromUri(self.file_uri)
        else:
            source = DefaultFactory.fromString(self.txt)
        return xsltproc.run(source)[0:]

    def generateWordList(self):
        """Generate the word list (wtag format for donatus) via wordlist.xsl."""
        if not hasattr(self, "wordList"):
            self.wordList = self._applyStylesheet('wordlist.xsl')
        return self.wordList

    def analyseWordList(self):
        """Send the word list to the Donatus server and return its morphData."""
        if not hasattr(self, 'analysedWordList'):
            server = xmlrpclib.ServerProxy(
                "http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
            payload = xmlrpclib.Binary(self.generateWordList())
            ret = server.donatus.analyze(payload)
            self.analysedWordList = ret['morphData'].data[0:]
        return self.analysedWordList

    def wordListToHash(self):
        """Return a dict mapping each variant form to the list of its lemma forms.

        Built from the <lemma form=...>/<variant form=...> elements of the
        analysed word list; lemma forms are listed once, in document order.
        """
        if not hasattr(self, 'words'):
            dom = xml.dom.minidom.parseString(self.analyseWordList())

            # Build into a local dict so a failure above or below does not
            # leave a half-filled cache on the instance.
            words = {}
            for lemma in dom.getElementsByTagName('lemma'):
                form = lemma.getAttribute('form')
                for variant in lemma.getElementsByTagName('variant'):
                    known = words.setdefault(variant.getAttribute('form'), [])
                    # The original reset the list to [form] when the lemma
                    # was already present, dropping the other lemmas.
                    if form not in known:
                        known.append(form)
            self.words = words
        return self.words

    def lemmatizeFile(self):
        """Return the source transformed by lemmatize.xsl."""
        if not hasattr(self, 'lemmatizedFile'):
            self.lemmatizedFile = self._applyStylesheet('lemmatize.xsl')
        return self.lemmatizedFile

    def addFormToWords(self):
        """Return the lemmatized DOM with form attributes on every mpiwg:w.

        For each mpiwg:w element the direct text is stripped and one
        trailing punctuation character is dropped.  If the result is a
        known variant, mpiwg:form is set to its first lemma and
        mpiwg:analysed to "yes"; otherwise, for non-empty text, mpiwg:form
        is the text itself and mpiwg:analysed is "no".
        """
        if not hasattr(self, 'dom_with_attributes'):
            dom = xml.dom.minidom.parseString(self.lemmatizeFile())
            words = self.wordListToHash()

            for word in dom.getElementsByTagName('mpiwg:w'):
                text = getTextFromNode(word).strip()

                if text and text[-1] in '.!();?[],':
                    textTmp = text[:-1]
                else:
                    textTmp = text

                if textTmp in words:
                    word.setAttribute("mpiwg:form", words[textTmp][0])
                    word.setAttribute("mpiwg:analysed", "yes")
                elif textTmp:
                    word.setAttribute("mpiwg:form", textTmp)
                    word.setAttribute("mpiwg:analysed", "no")
            self.dom_with_attributes = dom
        return self.dom_with_attributes

    def convertedXML(self):
        """Return the attributed DOM serialized as UTF-8 encoded XML."""
        return self.addFormToWords().toxml('utf-8')

    def wordsToLinks(self):
        """Return the converted XML with mpiwg:w elements turned into links.

        The mpiwg:w tags themselves are dropped.  The first text chunk
        inside an analysed word becomes a dictionary link for its
        mpiwg:form, inside a not-analysed word a bare <a>...</a>; every
        other tag and text is copied through.
        """
        # xml.parsers.expat is not imported at file level; import it here
        # instead of relying on xml.dom.minidom having loaded it.
        from xml.parsers import expat

        wordTag = "mpiwg:w"
        linkTemplate = """<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """

        xmlTxt = self.convertedXML()

        # State lives in the closure instead of module globals so that the
        # method is re-entrant and does not depend on earlier calls.
        parts = []
        state = {'wordAttrs': None}

        # NOTE(review): attribute values and text are written back without
        # XML escaping (as in the original); input containing &, < or
        # quotes yields malformed markup - confirm whether consumers need
        # escaping here.
        def start_element(name, attrs):
            if name == wordTag:
                # remember the attributes for the text that follows
                state['wordAttrs'] = attrs
                return
            tag = "<" + name
            for attr in attrs.keys():
                tag += """ %s="%s" """ % (attr, attrs[attr])
            parts.append(tag + ">")

        def end_element(name):
            if name == wordTag:
                # an empty word must not turn the following text into a link
                state['wordAttrs'] = None
            else:
                parts.append("</%s>" % name)

        def char_data(data):
            attrs = state['wordAttrs']
            state['wordAttrs'] = None
            if attrs is None or 'mpiwg:form' not in attrs:
                parts.append(data)
            elif attrs.get('mpiwg:analysed') == 'yes':
                parts.append(linkTemplate % (attrs['mpiwg:form'], data))
            else:
                parts.append("<a>" + data + "</a>")

        p = expat.ParserCreate()
        # deliver contiguous text in one callback so a whole word is linked
        p.buffer_text = True
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data
        p.Parse(xmlTxt, 1)

        return "".join(parts)
! 229:
! 230:
! 231: #def convertFile(source,target):
! 232: # '''
! 233: # @param source:source directory tree
! 234: # @param target: target directory tree
! 235: # '''
! 236: #
! 237: # if not os.path.exists(target):
! 238: # os.mkdir(target)
! 239: # for root,dirs,files in os.walk(source):
! 240: #
! 241: # for dir in dirs:
! 242: #
! 243: # dirName=os.path.join(root,dir).replace(source,target)
! 244: # if not os.path.exists(dirName):
! 245: # os.mkdir(dirName)
! 246: #
! 247: # for name in files:
! 248: # fileName=os.path.join(root,name)
! 249: #
! 250: # if os.path.splitext(fileName)[1]==".xml":
! 251: # fileNameNeu=fileName.replace(source,target)
! 252: # print "processing",fileNameNeu
! 253: # fh=file(fileNameNeu,"w")
! 254: # try:
! 255: # fh.write(donatusFile(fileName).convertedXML())
! 256: # except:
! 257: # print "ERROR:",fileName
! 258: # fh.close()
! 259: #
! 260: #rootDir="/Users/dwinter/Diss/Quellen-primaer/Formax/Done"
! 261: #rootDirNeu="/Users/dwinter/Diss/Quellen-primaer/transformed0.1"
! 262: #
! 263: #convertFile(rootDir,rootDirNeu)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>