File:  [Repository] / ECHO_content / analyseAndTag / analyseAndTag.py
Revision 1.3: download - view: text, annotated - select for diffs - revision graph
Mon Sep 11 14:43:23 2006 UTC (17 years, 10 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD
verbesserung für text unterstützung, text kann jetzt aus url kommen, sprache bisher nur deutsch

    1: import os.path
    2: import os
    3: import xmlrpclib
    4: import xml.dom.minidom
    5: import urllib
    6: 
    7: from Ft.Xml.Xslt.Processor import Processor
    8: from Ft.Xml.InputSource import DefaultFactory
    9: 
   10: from Ft.Lib import Uri
   11: 
   12: def package_home(gdict): 
   13:     filename = gdict["__file__"] 
   14:     return os.path.dirname(filename)
   15: 
   16: def getTextFromNode(nodename):
   17:     nodelist=nodename.childNodes
   18:     rc = ""
   19:     for node in nodelist:
   20:         if node.nodeType == node.TEXT_NODE:
   21:            rc = rc + node.data
   22:     return rc
   23: 
   24: class DonatusFile:
   25:     def __init__(self,fileName=None,url=None,txt=None,baseUri=None):
   26:         '''
   27:     
   28:         @param fileName:path to the filename
   29:         @url fals url
   30:         '''
   31:         if fileName:
   32:             self.fileName=fileName
   33:             self.file_uri= Uri.OsPathToUri(fileName, attemptAbsolute=1)
   34:         elif url:
   35:             self.filename=self.file_uri=url
   36:         elif txt:
   37:             self.fileName="txt"
   38:             self.file_uri=None
   39:             self.txt=txt
   40:         else:
   41:            return None
   42:         self.baseUri=baseUri
   43:         
   44:     def generateWordList(self):
   45:         '''
   46:         generate wordList (wtag format for donatus)
   47:         '''
   48:         
   49:         if not hasattr(self,"wordList"):
   50:             xsltproc = Processor()
   51:             xsl_uri = Uri.OsPathToUri(os.path.join(package_home(globals()),'wordlist.xsl'), attemptAbsolute=1)
   52:             xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))
   53: 
   54:             if self.file_uri:
   55:                 self.wordList = xsltproc.run(DefaultFactory.fromUri(self.file_uri))[0:]
   56:             else:
   57:                 self.wordList = xsltproc.run(DefaultFactory.fromString(self.txt))[0:]
   58:         return self.wordList
   59: 
   60:     def analyseWordList(self):
   61:         '''
   62:         wordList nach donatus
   63:         '''
   64:         try:
   65:             if not hasattr(self,'analysedWordList'):
   66:                 server=xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
   67:     
   68:                 bin=xmlrpclib.Binary(getattr(self,"wordList",self.generateWordList()))
   69:     
   70:                 ret=server.donatus.analyze(bin)
   71:                 
   72:                 self.analysedWordList=ret['morphData'].data[0:]
   73:     
   74:             return self.analysedWordList
   75:         except:
   76:             print "ERROR: cannot analyse words"
   77:             self.analyseWordList="""<?xml version="1.0"?><ERROR>cannot analyse wordlist</ERROR>"""
   78:             return self.analyseWordList
   79: 
   80:     def wordListToHash(self):
   81:         '''
   82:         wordList to hash
   83:         '''
   84:         if not hasattr(self,'words'):
   85: 
   86:             self.words={}
   87:             dom=xml.dom.minidom.parseString(getattr(self,'analysedWordist',self.analyseWordList()))
   88: 
   89:             lemmas=dom.getElementsByTagName('lemma')
   90: 
   91:             for lemma in lemmas:
   92:                 form=lemma.getAttribute('form')
   93:                 variants=lemma.getElementsByTagName('variant')
   94:                 for variant in variants:
   95:                     formV=variant.getAttribute('form')
   96:                     if self.words.has_key(formV) and not (form in self.words[formV]):
   97:                         self.words[formV].append(form)
   98:                     else:
   99:                         self.words[formV]=[form]
  100:         return self.words
  101: 
  102:     def lemmatizeFile(self):
  103:         '''
  104:         lemmatize file
  105:         '''
  106:         if not hasattr(self,'lemmatizedFile'):
  107:             xsltproc = Processor()
  108:             xsl_uri = Uri.OsPathToUri(os.path.join(package_home(globals()),'lemmatize.xsl'), attemptAbsolute=1)
  109:             xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))
  110: 
  111:             if getattr(self,'file_uri',None):
  112:                 lemmatized = xsltproc.run(DefaultFactory.fromUri(self.file_uri))[0:]
  113:             else:
  114:                 lemmatized = xsltproc.run(DefaultFactory.fromString(self.txt,self.baseUri))[0:]
  115: 
  116:             self.lemmatizedFile=lemmatized
  117: 
  118:         return self.lemmatizedFile
  119:     
  120:     def addFormToWords(self):
  121:         '''
  122:         add form attributes to the words
  123:         '''
  124:         if not hasattr(self,'dom_with_attributes'):
  125:             
  126:             dom=xml.dom.minidom.parseString(getattr(self,'lemmatizedFile',self.lemmatizeFile()))
  127: 
  128:             wordNodes=dom.getElementsByTagName('mpiwg:w')
  129:             #words=getattr(self,'words',self.wordListToHash())
  130:             words=self.wordListToHash()
  131: 
  132:             for word in wordNodes:
  133:                 
  134:                 text=getTextFromNode(word)
  135:                 text=text.lstrip().rstrip()
  136: 
  137:                 if (len(text)>0) and ('.!();?[],'.find(text[-1])>-1):
  138: 
  139:                     textTmp=text[0:len(text)-1]
  140:                 else:
  141:                     textTmp=text
  142: 
  143:                 
  144: 
  145:                 if words.has_key(textTmp):
  146:                     form=words[textTmp][0]
  147:                     word.setAttribute("mpiwg:form",form)
  148:                     word.setAttribute("mpiwg:analysed","yes")
  149:                 else:
  150:                    if (textTmp!="") and (textTmp !=" "):
  151:                          word.setAttribute("mpiwg:form",textTmp)
  152:                          word.setAttribute("mpiwg:analysed","no")
  153:             self.dom_with_attributes=dom
  154:         return self.dom_with_attributes 
  155: 
  156:     def convertedXML(self):
  157:         dom=getattr(self,'dom_with_attributes',self.addFormToWords())
  158:         return dom.toxml('utf-8')
  159: 
  160:     def wordsToLinks(self):
  161:         xmlTxt=self.convertedXML()
  162: 
  163:         global retLex
  164:         global toggle
  165: 
  166:         toggle=0
  167:         retLex=""
  168:         saved_attrs={}
  169:    
  170:         def createTag(name,attrs):
  171:                 global toggle
  172:                 global saved_attrs
  173:                 if name=="mpiwg:w":
  174:                         toggle=1
  175:                         saved_attrs=attrs
  176:                         return ""
  177:                 else:
  178:                         tag="<"
  179:                         tag+=name
  180:                         for attr in attrs.keys():
  181:                                 tag+=""" %s="%s" """%(attr,attrs[attr])
  182:                         tag+=">"
  183:                 return tag
  184:                         
  185:         def createData(data):
  186:                 global toggle
  187:                 global saved_attrs
  188:                 print saved_attrs
  189:                 astring="""<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """
  190:                 urlString="""http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de"""
  191:                 if toggle: # tag war ein w
  192:                         toggle=0
  193:                         if saved_attrs.has_key('mpiwg:form'):
  194:                             if saved_attrs['mpiwg:analysed']=='yes':
  195: 
  196:                                 return astring%(saved_attrs['mpiwg:form'],data)
  197:                             else:
  198:                                 return "<a>"+data+"</a>"
  199:                         else:
  200:                             return data
  201:            
  202:                         
  203:                                 
  204: 
  205:         # 3 handler functions
  206:         def start_element(name, attrs):
  207:                 global retLex
  208:           
  209:                 retLex+=createTag(name,attrs)
  210:         def end_element(name):
  211:                 global retLex
  212:                 if not name=="mpiwg:w":
  213:                         retLex+="</%s>"%(name.encode('utf-8'))
  214:                 
  215:             
  216:         def char_data(data):
  217:                 global retLex
  218:                 retLex+=createData(data)
  219:                 if data:
  220:                         try:
  221:                                 retLex+=createData(data)
  222:                         except:
  223:                                 """no"""
  224:                                 
  225:         p = xml.parsers.expat.ParserCreate()
  226: 
  227:         p.StartElementHandler = start_element
  228:         p.EndElementHandler = end_element
  229:         p.CharacterDataHandler = char_data
  230:         
  231:         p.Parse(xmlTxt,1)
  232:         #print repr(lemmatized.encode('utf-8'))
  233: 
  234:         return retLex
  235: 
  236: 
  237: #def convertFile(source,target):
  238: #    '''
  239: #    @param source:source directory tree
  240: #    @param target: target directory tree
  241: #    '''
  242: #    
  243: #    if not os.path.exists(target):
  244: #        os.mkdir(target)
  245: #    for root,dirs,files in os.walk(source):
  246: #    
  247: #        for dir in dirs:
  248: #    
  249: #    	        dirName=os.path.join(root,dir).replace(source,target)
  250: #    	        if not os.path.exists(dirName):
  251: #    	            os.mkdir(dirName)
  252: #    
  253: #        for name in files:
  254: #    	        fileName=os.path.join(root,name)
  255: #    	
  256: #            	if os.path.splitext(fileName)[1]==".xml":
  257: #            	    fileNameNeu=fileName.replace(source,target)
  258: #            	    print "processing",fileNameNeu
  259: #            	    fh=file(fileNameNeu,"w")
  260: #            	    try:
  261: #                		fh.write(donatusFile(fileName).convertedXML())
  262: #            	    except:
  263: #                		print "ERROR:",fileName
  264: #            	    fh.close()
  265: #
  266: #rootDir="/Users/dwinter/Diss/Quellen-primaer/Formax/Done"
  267: #rootDirNeu="/Users/dwinter/Diss/Quellen-primaer/transformed0.1"
  268: #
  269: #convertFile(rootDir,rootDirNeu)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>