# File: [Repository] / ECHO_content / ECHO_language.py
# Revision 1.4 (download - view: text, annotated - select for diffs - revision graph)
# Sun Sep 10 11:03:07 2006 UTC (17 years, 8 months ago) by dwinter
# Branches: MAIN
# CVS tags: HEAD
# Log message: new version of lexical analysis hub

"""Methods for language technologies."""


import xml.dom.minidom
import xml.parsers
import xml.parsers.expat
from xml.sax.saxutils import escape

from Products.ECHO_content.analyseAndTag.analyseAndTag import DonatusFile

def donatus(txt2, url="http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc"):
    """Send a text to the Donatus XML-RPC service and return its morphology data.

    txt2 -- text to analyse; it is encoded as UTF-8 and sent as an
            XML-RPC binary payload.
    url  -- endpoint of the Donatus XML-RPC server.  Defaults to the
            server that was previously hard-coded, so existing callers
            are unaffected.

    Returns the ``data`` of the ``morphData`` entry in the server's reply.
    Network and XML-RPC errors raised by ``xmlrpclib`` are not caught here.
    """
    # Imported lazily so that merely loading this module does not require
    # the XML-RPC client.
    import xmlrpclib

    server = xmlrpclib.ServerProxy(url)
    payload = xmlrpclib.Binary(txt2.encode('utf-8'))
    reply = server.donatus.analyze(payload)
    return reply['morphData'].data


def donatusVariant2Lemma(morphData):
    """Build a mapping from variant form to the lemma forms it belongs to.

    morphData -- XML document (as accepted by ``minidom.parseString``)
                 containing ``lemma`` elements, each with a ``form``
                 attribute and nested ``variant`` elements that also carry
                 a ``form`` attribute.

    Returns a dict ``{variant form: [lemma form, ...]}``.  A variant listed
    under several lemmas collects all of them, in document order.
    """
    variant_to_lemmas = {}
    dom = xml.dom.minidom.parseString(morphData)
    for lemma in dom.getElementsByTagName('lemma'):
        lemma_form = lemma.getAttribute('form')
        for variant in lemma.getElementsByTagName('variant'):
            # setdefault + append replaces the former
            # ``ret[atr].append = ...`` assignment, which raised
            # AttributeError for any variant shared by two lemmas.
            variant_form = variant.getAttribute('form')
            variant_to_lemmas.setdefault(variant_form, []).append(lemma_form)
    return variant_to_lemmas

# Envelope that lemmatize() wraps around the tokenised page.  The leading
# whitespace on the continuation lines is kept exactly as in the original
# literal.
_WTAG_TEMPLATE = (
    '<wtag locator="xxx">\n'
    '                <section lang="%s"><s>%s</s></section>\n'
    '                </wtag>'
)

# Link emitted by tagLex_old() for every word: (lemma, word as written).
_DICT_LINK = """<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """

# Extra entity used when escaping text that ends up inside a
# double-quoted attribute value.
_QUOT = {'"': '&quot;'}


def _format_start_tag(name, attrs):
    """Serialise an expat start-element event back into a start tag.

    Attribute values are escaped so that the tag stays well-formed even if
    the parsed value contained ``&``, ``<``, ``>`` or ``"``.
    """
    parts = ["<", name]
    for attr in attrs.keys():
        parts.append(""" %s="%s" """ % (attr, escape(attrs[attr], _QUOT)))
    parts.append(">")
    return "".join(parts)


class ECHO_language:
    """Language methods.

    The methods call ``self.getPage``, which is not defined in this class
    and must be provided elsewhere (presumably by the class this one is
    mixed into -- TODO confirm).
    """

    def donatusVariant2Lemma(self, nr='1'):
        """Analyse page ``nr`` with Donatus; return {variant: [lemmas]}."""
        return donatusVariant2Lemma(donatus(self.lemmatize(nr)))

    def tagLex(self, nr="1"):
        """Return page ``nr`` with its words converted to links.

        The page text is handed to ``DonatusFile`` and the result of its
        ``wordsToLinks()`` is returned.
        """
        # Fetch the page once; previously it was fetched twice and the
        # first result was discarded.
        page = self.getPage(_pn=nr)
        return DonatusFile(txt=page).wordsToLinks()

    def tagLex_old(self, nr="1"):
        """Generate dictionary links for every word of page ``nr``.

        The page is tokenised with ``lemmatize``, analysed by Donatus, and
        then re-serialised: every ``<w>word</w>`` is replaced by a link to
        the dictionary lookup of the word's first lemma (or of the word
        itself if Donatus returned no lemma for it).  All other elements
        are copied through; character data outside ``<w>`` is dropped.
        """
        lemmatized = self.lemmatize(nr)
        variants = donatusVariant2Lemma(donatus(lemmatized))

        out = []                      # output fragments, joined at the end
        word = []                     # character chunks of the current <w>
        state = {"in_word": False}    # mutable cell; no `nonlocal` needed

        def start_element(name, attrs):
            if name == "w":
                state["in_word"] = True
                del word[:]
            else:
                out.append(_format_start_tag(name, attrs))

        def end_element(name):
            if name != "w":
                out.append("</%s>" % name)
                return
            # Emit the link only once the whole word is known: expat may
            # deliver the text of one element in several chunks.
            state["in_word"] = False
            form = "".join(word)
            if form:
                lemma = variants.get(form, [form])[0]
                out.append(_DICT_LINK % (escape(lemma, _QUOT), escape(form)))

        def char_data(data):
            if state["in_word"]:
                word.append(data)

        p = xml.parsers.expat.ParserCreate()
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data
        p.Parse(lemmatized.encode('utf-8'), 1)

        return "".join(out)

    def lemmatize(self, nr='1', lang="de"):
        """Tokenise page ``nr`` for the lexical analysis.

        The page is parsed as XML and re-serialised with every
        whitespace-separated word of its character data wrapped in
        ``<w>...</w>``.  The result is embedded in a
        ``wtag``/``section``/``s`` envelope whose ``section`` carries
        ``lang`` as its ``lang`` attribute.

        Raises ``xml.parsers.expat.ExpatError`` if the page is not
        well-formed XML.
        """
        # NOTE(review): the page number is passed positionally here but as
        # ``_pn=`` in tagLex -- confirm both reach the same parameter.
        page = self.getPage(nr)

        out = []       # output fragments, joined at the end
        pending = []   # character chunks since the last element boundary

        def flush_text():
            # Split into words only at element boundaries, so a word that
            # expat delivers in two chunks is still tagged as one word.
            if pending:
                words = "".join(pending).split()
                del pending[:]
                out.append("\n".join(["<w>%s</w>" % escape(w) for w in words]))

        def start_element(name, attrs):
            flush_text()
            out.append(_format_start_tag(name, attrs))

        def end_element(name):
            flush_text()
            out.append("</%s>" % name)

        def char_data(data):
            pending.append(data)

        p = xml.parsers.expat.ParserCreate()
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data
        p.Parse(page, 1)
        flush_text()

        return _WTAG_TEMPLATE % (lang, "".join(out))

# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>