File:  [Repository] / ECHO_content / analyseAndTag / analyseAndTag.py
Revision 1.3: download - view: text, annotated - select for diffs - revision graph
Mon Sep 11 14:43:23 2006 UTC (17 years, 10 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD
verbesserung für text unterstützung, text kann jetzt aus url kommen, sprache bisher nur deutsch

import os.path
import os
import xmlrpclib
import xml.dom.minidom
import urllib

from Ft.Xml.Xslt.Processor import Processor
from Ft.Xml.InputSource import DefaultFactory

from Ft.Lib import Uri

def package_home(gdict):
    '''
    Return the directory part of the "__file__" entry of a globals dict.

    @param gdict: dictionary (normally a module's globals()) that holds
                  a "__file__" key
    @return: everything in gdict["__file__"] before the last path component
    '''
    directory, _basename = os.path.split(gdict["__file__"])
    return directory

def getTextFromNode(nodename):
    '''
    Concatenate the data of the direct text-node children of a DOM node.

    Only immediate children whose nodeType equals TEXT_NODE contribute;
    text nested inside child elements is not included.

    @param nodename: DOM node whose childNodes are inspected
    @return: the joined text, or "" when there is no text child
    '''
    pieces = [child.data
              for child in nodename.childNodes
              if child.nodeType == child.TEXT_NODE]
    return "".join(pieces)

class DonatusFile:
    def __init__(self,fileName=None,url=None,txt=None,baseUri=None):
        '''
    
        @param fileName:path to the filename
        @url fals url
        '''
        if fileName:
            self.fileName=fileName
            self.file_uri= Uri.OsPathToUri(fileName, attemptAbsolute=1)
        elif url:
            self.filename=self.file_uri=url
        elif txt:
            self.fileName="txt"
            self.file_uri=None
            self.txt=txt
        else:
           return None
        self.baseUri=baseUri
        
    def generateWordList(self):
        '''
        generate wordList (wtag format for donatus)
        '''
        
        if not hasattr(self,"wordList"):
            xsltproc = Processor()
            xsl_uri = Uri.OsPathToUri(os.path.join(package_home(globals()),'wordlist.xsl'), attemptAbsolute=1)
            xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))

            if self.file_uri:
                self.wordList = xsltproc.run(DefaultFactory.fromUri(self.file_uri))[0:]
            else:
                self.wordList = xsltproc.run(DefaultFactory.fromString(self.txt))[0:]
        return self.wordList

    def analyseWordList(self):
        '''
        wordList nach donatus
        '''
        try:
            if not hasattr(self,'analysedWordList'):
                server=xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
    
                bin=xmlrpclib.Binary(getattr(self,"wordList",self.generateWordList()))
    
                ret=server.donatus.analyze(bin)
                
                self.analysedWordList=ret['morphData'].data[0:]
    
            return self.analysedWordList
        except:
            print "ERROR: cannot analyse words"
            self.analyseWordList="""<?xml version="1.0"?><ERROR>cannot analyse wordlist</ERROR>"""
            return self.analyseWordList

    def wordListToHash(self):
        '''
        wordList to hash
        '''
        if not hasattr(self,'words'):

            self.words={}
            dom=xml.dom.minidom.parseString(getattr(self,'analysedWordist',self.analyseWordList()))

            lemmas=dom.getElementsByTagName('lemma')

            for lemma in lemmas:
                form=lemma.getAttribute('form')
                variants=lemma.getElementsByTagName('variant')
                for variant in variants:
                    formV=variant.getAttribute('form')
                    if self.words.has_key(formV) and not (form in self.words[formV]):
                        self.words[formV].append(form)
                    else:
                        self.words[formV]=[form]
        return self.words

    def lemmatizeFile(self):
        '''
        lemmatize file
        '''
        if not hasattr(self,'lemmatizedFile'):
            xsltproc = Processor()
            xsl_uri = Uri.OsPathToUri(os.path.join(package_home(globals()),'lemmatize.xsl'), attemptAbsolute=1)
            xsltproc.appendStylesheet(DefaultFactory.fromUri(xsl_uri))

            if getattr(self,'file_uri',None):
                lemmatized = xsltproc.run(DefaultFactory.fromUri(self.file_uri))[0:]
            else:
                lemmatized = xsltproc.run(DefaultFactory.fromString(self.txt,self.baseUri))[0:]

            self.lemmatizedFile=lemmatized

        return self.lemmatizedFile
    
    def addFormToWords(self):
        '''
        add form attributes to the words
        '''
        if not hasattr(self,'dom_with_attributes'):
            
            dom=xml.dom.minidom.parseString(getattr(self,'lemmatizedFile',self.lemmatizeFile()))

            wordNodes=dom.getElementsByTagName('mpiwg:w')
            #words=getattr(self,'words',self.wordListToHash())
            words=self.wordListToHash()

            for word in wordNodes:
                
                text=getTextFromNode(word)
                text=text.lstrip().rstrip()

                if (len(text)>0) and ('.!();?[],'.find(text[-1])>-1):

                    textTmp=text[0:len(text)-1]
                else:
                    textTmp=text

                

                if words.has_key(textTmp):
                    form=words[textTmp][0]
                    word.setAttribute("mpiwg:form",form)
                    word.setAttribute("mpiwg:analysed","yes")
                else:
                   if (textTmp!="") and (textTmp !=" "):
                         word.setAttribute("mpiwg:form",textTmp)
                         word.setAttribute("mpiwg:analysed","no")
            self.dom_with_attributes=dom
        return self.dom_with_attributes 

    def convertedXML(self):
        dom=getattr(self,'dom_with_attributes',self.addFormToWords())
        return dom.toxml('utf-8')

    def wordsToLinks(self):
        xmlTxt=self.convertedXML()

        global retLex
        global toggle

        toggle=0
        retLex=""
        saved_attrs={}
   
        def createTag(name,attrs):
                global toggle
                global saved_attrs
                if name=="mpiwg:w":
                        toggle=1
                        saved_attrs=attrs
                        return ""
                else:
                        tag="<"
                        tag+=name
                        for attr in attrs.keys():
                                tag+=""" %s="%s" """%(attr,attrs[attr])
                        tag+=">"
                return tag
                        
        def createData(data):
                global toggle
                global saved_attrs
                print saved_attrs
                astring="""<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """
                urlString="""http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de"""
                if toggle: # tag war ein w
                        toggle=0
                        if saved_attrs.has_key('mpiwg:form'):
                            if saved_attrs['mpiwg:analysed']=='yes':

                                return astring%(saved_attrs['mpiwg:form'],data)
                            else:
                                return "<a>"+data+"</a>"
                        else:
                            return data
           
                        
                                

        # 3 handler functions
        def start_element(name, attrs):
                global retLex
          
                retLex+=createTag(name,attrs)
        def end_element(name):
                global retLex
                if not name=="mpiwg:w":
                        retLex+="</%s>"%(name.encode('utf-8'))
                
            
        def char_data(data):
                global retLex
                retLex+=createData(data)
                if data:
                        try:
                                retLex+=createData(data)
                        except:
                                """no"""
                                
        p = xml.parsers.expat.ParserCreate()

        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data
        
        p.Parse(xmlTxt,1)
        #print repr(lemmatized.encode('utf-8'))

        return retLex


#def convertFile(source,target):
#    '''
#    @param source:source directory tree
#    @param target: target directory tree
#    '''
#    
#    if not os.path.exists(target):
#        os.mkdir(target)
#    for root,dirs,files in os.walk(source):
#    
#        for dir in dirs:
#    
#    	        dirName=os.path.join(root,dir).replace(source,target)
#    	        if not os.path.exists(dirName):
#    	            os.mkdir(dirName)
#    
#        for name in files:
#    	        fileName=os.path.join(root,name)
#    	
#            	if os.path.splitext(fileName)[1]==".xml":
#            	    fileNameNeu=fileName.replace(source,target)
#            	    print "processing",fileNameNeu
#            	    fh=file(fileNameNeu,"w")
#            	    try:
#                		fh.write(donatusFile(fileName).convertedXML())
#            	    except:
#                		print "ERROR:",fileName
#            	    fh.close()
#
#rootDir="/Users/dwinter/Diss/Quellen-primaer/Formax/Done"
#rootDirNeu="/Users/dwinter/Diss/Quellen-primaer/transformed0.1"
#
#convertFile(rootDir,rootDirNeu)

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>