File:  [Repository] / ECHO_content / analyseAndTag / analyseAndTag.py
Revision 1.4: download - view: text, annotated - select for diffs - revision graph
Tue Jan 9 17:01:01 2007 UTC (17 years, 6 months ago) by dwinter
Branches: MAIN
CVS tags: cleanup, Root_cleanup, HEAD
bug fixed in echo_xslt

import logging
import os
import os.path
import urllib
import xml.dom.minidom
import xml.parsers.expat
import xml.sax.saxutils
import xmlrpclib

from Ft.Lib import Uri
from Ft.Xml.InputSource import DefaultFactory
from Ft.Xml.Xslt.Processor import Processor
   11: 
   12: def package_home(gdict): 
   13:     filename = gdict["__file__"] 
   14:     return os.path.dirname(filename)
   15: 
   16: def getTextFromNode(nodename):
   17:     nodelist=nodename.childNodes
   18:     rc = ""
   19:     for node in nodelist:
   20:         if node.nodeType == node.TEXT_NODE:
   21:            rc = rc + node.data
   22:     return rc
   23: 
   24: class DonatusFile:
   25:     def __init__(self,fileName=None,url=None,txt=None,baseUri=None):
   26:         '''
   27:     
   28:         @param fileName:path to the filename
   29:         @url fals url
   30:         '''
   31:         if fileName:
   32:             self.fileName=fileName
   33:             self.file_uri= Uri.OsPathToUri(fileName, attemptAbsolute=1)
   34:         elif url:
   35:             self.filename=self.file_uri=url
   36:         elif txt:
   37:             self.fileName="txt"
   38:             self.file_uri=None
   39:             self.txt=txt
   40:         else:
   41:            return None
   42:         self.baseUri=baseUri
   43:         
   44:     def generateWordList(self):
   45:         '''
   46:         generate wordList (wtag format for donatus)
   47:         '''
   48:         
   49:         if not hasattr(self,"wordList"):
   50:             xsltproc = Processor()
   51:             xsl_uri = Uri.OsPathToUri(os.path.join(package_home(globals()),'wordlist.xsl'), attemptAbsolute=1)
   52:             xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))
   53: 
   54:             if self.file_uri:
   55:                 self.wordList = xsltproc.run(DefaultFactory.fromUri(self.file_uri))[0:]
   56:             else:
   57:                 self.wordList = xsltproc.run(DefaultFactory.fromString(self.txt))[0:]
   58:         return self.wordList
   59: 
   60:     def analyseWordList(self):
   61:         '''
   62:         wordList nach donatus
   63:         '''
   64:         try:
   65:             prssafsaf
   66:             if not hasattr(self,'analysedWordList'):
   67:                 server=xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
   68:     
   69:                 bin=xmlrpclib.Binary(getattr(self,"wordList",self.generateWordList()))
   70:     
   71:                 ret=server.donatus.analyze(bin)
   72:                 
   73:                 self.analysedWordList=ret['morphData'].data[0:]
   74:     
   75:             return self.analysedWordList
   76:         except:
   77:             print "ERROR: cannot analyse words"
   78:             self.analyseWordList="""<?xml version="1.0"?><ERROR>cannot analyse wordlist</ERROR>"""
   79:             return self.analyseWordList
   80: 
   81:     def wordListToHash(self):
   82:         '''
   83:         wordList to hash
   84:         '''
   85:         if not hasattr(self,'words'):
   86: 
   87:             self.words={}
   88:             dom=xml.dom.minidom.parseString(getattr(self,'analysedWordist',self.analyseWordList()))
   89: 
   90:             lemmas=dom.getElementsByTagName('lemma')
   91: 
   92:             for lemma in lemmas:
   93:                 form=lemma.getAttribute('form')
   94:                 variants=lemma.getElementsByTagName('variant')
   95:                 for variant in variants:
   96:                     formV=variant.getAttribute('form')
   97:                     if self.words.has_key(formV) and not (form in self.words[formV]):
   98:                         self.words[formV].append(form)
   99:                     else:
  100:                         self.words[formV]=[form]
  101:         return self.words
  102: 
  103:     def lemmatizeFile(self):
  104:         '''
  105:         lemmatize file
  106:         '''
  107:         if not hasattr(self,'lemmatizedFile'):
  108:             xsltproc = Processor()
  109:             xsl_uri = Uri.OsPathToUri(os.path.join(package_home(globals()),'lemmatize.xsl'), attemptAbsolute=1)
  110:             xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))
  111: 
  112:             if getattr(self,'file_uri',None):
  113:                 lemmatized = xsltproc.run(DefaultFactory.fromUri(self.file_uri))[0:]
  114:             else:
  115:                 lemmatized = xsltproc.run(DefaultFactory.fromString(self.txt,self.baseUri))[0:]
  116: 
  117:             self.lemmatizedFile=lemmatized
  118: 
  119:         return self.lemmatizedFile
  120:     
  121:     def addFormToWords(self):
  122:         '''
  123:         add form attributes to the words
  124:         '''
  125:         if not hasattr(self,'dom_with_attributes'):
  126:             
  127:             dom=xml.dom.minidom.parseString(getattr(self,'lemmatizedFile',self.lemmatizeFile()))
  128: 
  129:             wordNodes=dom.getElementsByTagName('mpiwg:w')
  130:             #words=getattr(self,'words',self.wordListToHash())
  131:             words=self.wordListToHash()
  132: 
  133:             for word in wordNodes:
  134:                 
  135:                 text=getTextFromNode(word)
  136:                 text=text.lstrip().rstrip()
  137: 
  138:                 if (len(text)>0) and ('.!();?[],'.find(text[-1])>-1):
  139: 
  140:                     textTmp=text[0:len(text)-1]
  141:                 else:
  142:                     textTmp=text
  143: 
  144:                 
  145: 
  146:                 if words.has_key(textTmp):
  147:                     form=words[textTmp][0]
  148:                     word.setAttribute("mpiwg:form",form)
  149:                     word.setAttribute("mpiwg:analysed","yes")
  150:                 else:
  151:                    if (textTmp!="") and (textTmp !=" "):
  152:                          word.setAttribute("mpiwg:form",textTmp)
  153:                          word.setAttribute("mpiwg:analysed","no")
  154:             self.dom_with_attributes=dom
  155:         return self.dom_with_attributes 
  156: 
  157:     def convertedXML(self):
  158:         dom=getattr(self,'dom_with_attributes',self.addFormToWords())
  159:         return dom.toxml('utf-8')
  160: 
  161:     def wordsToLinks(self):
  162:         xmlTxt=self.convertedXML()
  163: 
  164:         global retLex
  165:         global toggle
  166: 
  167:         toggle=0
  168:         retLex=""
  169:         saved_attrs={}
  170:    
  171:         def createTag(name,attrs):
  172:                 global toggle
  173:                 global saved_attrs
  174:                 if name=="mpiwg:w":
  175:                         toggle=1
  176:                         saved_attrs=attrs
  177:                         return ""
  178:                 else:
  179:                         tag="<"
  180:                         tag+=name
  181:                         for attr in attrs.keys():
  182:                                 tag+=""" %s="%s" """%(attr,attrs[attr])
  183:                         tag+=">"
  184:                 return tag
  185:                         
  186:         def createData(data):
  187:                 global toggle
  188:                 global saved_attrs
  189:                 print saved_attrs
  190:                 astring="""<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """
  191:                 urlString="""http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de"""
  192:                 if toggle: # tag war ein w
  193:                         toggle=0
  194:                         if saved_attrs.has_key('mpiwg:form'):
  195:                             if saved_attrs['mpiwg:analysed']=='yes':
  196: 
  197:                                 return astring%(saved_attrs['mpiwg:form'],data)
  198:                             else:
  199:                                 return "<a>"+data+"</a>"
  200:                         else:
  201:                             return data
  202:            
  203:                         
  204:                                 
  205: 
  206:         # 3 handler functions
  207:         def start_element(name, attrs):
  208:                 global retLex
  209:           
  210:                 retLex+=createTag(name,attrs)
  211:         def end_element(name):
  212:                 global retLex
  213:                 if not name=="mpiwg:w":
  214:                         retLex+="</%s>"%(name.encode('utf-8'))
  215:                 
  216:             
  217:         def char_data(data):
  218:                 global retLex
  219:                 retLex+=createData(data)
  220:                 if data:
  221:                         try:
  222:                                 retLex+=createData(data)
  223:                         except:
  224:                                 """no"""
  225:                                 
  226:         p = xml.parsers.expat.ParserCreate()
  227: 
  228:         p.StartElementHandler = start_element
  229:         p.EndElementHandler = end_element
  230:         p.CharacterDataHandler = char_data
  231:         
  232:         p.Parse(xmlTxt,1)
  233:         #print repr(lemmatized.encode('utf-8'))
  234: 
  235:         return retLex
  236: 
  237: 
  238: #def convertFile(source,target):
  239: #    '''
  240: #    @param source:source directory tree
  241: #    @param target: target directory tree
  242: #    '''
  243: #    
  244: #    if not os.path.exists(target):
  245: #        os.mkdir(target)
  246: #    for root,dirs,files in os.walk(source):
  247: #    
  248: #        for dir in dirs:
  249: #    
  250: #    	        dirName=os.path.join(root,dir).replace(source,target)
  251: #    	        if not os.path.exists(dirName):
  252: #    	            os.mkdir(dirName)
  253: #    
  254: #        for name in files:
  255: #    	        fileName=os.path.join(root,name)
  256: #    	
  257: #            	if os.path.splitext(fileName)[1]==".xml":
  258: #            	    fileNameNeu=fileName.replace(source,target)
  259: #            	    print "processing",fileNameNeu
  260: #            	    fh=file(fileNameNeu,"w")
  261: #            	    try:
  262: #                		fh.write(donatusFile(fileName).convertedXML())
  263: #            	    except:
  264: #                		print "ERROR:",fileName
  265: #            	    fh.close()
  266: #
  267: #rootDir="/Users/dwinter/Diss/Quellen-primaer/Formax/Done"
  268: #rootDirNeu="/Users/dwinter/Diss/Quellen-primaer/transformed0.1"
  269: #
  270: #convertFile(rootDir,rootDirNeu)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>