File:  [Repository] / ECHO_content / ECHO_language.py
Revision 1.4: download - view: text, annotated - select for diffs - revision graph
Sun Sep 10 11:03:07 2006 UTC (17 years, 10 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD
new version of lexical analysis hub

    1: """Methoden fuer Language Technologies"""
    2: 
    3: 
    4: from Products.ECHO_content.analyseAndTag.analyseAndTag import DonatusFile
    5: 
    6: import xml.parsers
    7: 
    8: def donatus(txt2):
    9:         import xmlrpclib
   10: 
   11:         server = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
   12: 
   13:         txt=txt2.encode('utf-8')
   14:         bin=xmlrpclib.Binary(txt)
   15: 
   16:         
   17: 
   18:         ret=server.donatus.analyze(bin)
   19: 
   20:                 
   21:         return ret['morphData'].data
   22: 
   23: 
   24: def donatusVariant2Lemma(morphData):
   25:         """creates hash variant -> morphdata"""
   26:         ret={}
   27:         dom=xml.dom.minidom.parseString(morphData)
   28:         lemmas=dom.getElementsByTagName('lemma')
   29:         for lemma in lemmas:
   30:                 variants=lemma.getElementsByTagName('variant')
   31:                 for variant in variants:
   32:                         atr=variant.getAttribute('form')
   33:                         if ret.has_key(atr):
   34:                                 ret[atr].append=lemma.getAttribute('form')
   35:                         else:
   36:                                 ret[atr]=[lemma.getAttribute('form')]
   37: 
   38:         return ret
   39: 
   40: class ECHO_language:
   41:         """language methods"""
   42:         
   43:         def donatusVariant2Lemma(self,nr='1'):
   44:                 """analyze by donatus"""
   45:                 return donatusVariant2Lemma(donatus(self.lemmatize(nr)))
   46: 
   47:         
   48:         def tagLex(self,nr="1"):
   49:             """gerateLinks"""
   50:             txt=self.getPage(_pn=nr)
   51:            
   52:             df=DonatusFile(txt=self.getPage(_pn=nr))
   53:            
   54:             return df.wordsToLinks()
   55:             #return DonatusFile(txt=self.getPage(_pn=nr)).convertedXML()
   56:         
   57:         def tagLex_old(self,nr="1"):
   58:                 """generate Links"""
   59:                 global retLex
   60:                 global toggle
   61: 
   62:                 toggle=0
   63:                 retLex=""
   64: 
   65:                 lemmatized=self.lemmatize(nr)[0:]
   66:                 #print "ho",repr(lemmatized)
   67:                 variants=donatusVariant2Lemma(donatus(lemmatized))
   68:                 
   69:                 def createTag(name,attrs):
   70:                         global toggle
   71:                         
   72:                         if name=="w":
   73:                                 toggle=1
   74:                                 return ""
   75:                         else:
   76:                                 tag="<"
   77:                                 tag+=name
   78:                                 for attr in attrs.keys():
   79:                                         tag+=""" %s="%s" """%(attr,attrs[attr])
   80:                                 tag+=">"
   81:                         return tag
   82:                                 
   83:                 def createData(data):
   84:                         global toggle
   85:                         astring="""<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """
   86:                         if toggle: # tag war ein w
   87:                                 toggle=0
   88:                                 if variants.has_key(data):
   89:                                         return astring%(variants[data][0],data)
   90:                                 else:
   91:                                         return astring%(data,data)
   92:                                 
   93:                                         
   94: 
   95:                 # 3 handler functions
   96:                 def start_element(name, attrs):
   97:                         global retLex
   98:                         
   99:                         retLex+=createTag(name,attrs)
  100:                 def end_element(name):
  101:                         global retLex
  102:                         if not name=="w":
  103:                                 retLex+="</%s>"%(name.encode('utf-8'))
  104:                         
  105:                     
  106:                 def char_data(data):
  107:                         global retLex
  108:                         if data:
  109:                                 try:
  110:                                         retLex+=createData(data)
  111:                                 except:
  112:                                         """no"""
  113:                                         
  114:                 p = xml.parsers.expat.ParserCreate()
  115: 
  116:                 p.StartElementHandler = start_element
  117:                 p.EndElementHandler = end_element
  118:                 p.CharacterDataHandler = char_data
  119:                 
  120:                 p.Parse(lemmatized.encode('utf-8'),1)
  121:                 #print repr(lemmatized.encode('utf-8'))
  122:         
  123:                 return retLex
  124:         
  125:                 
  126:         def lemmatize(self,nr='1',lang="de"):
  127:                 """lemmatize"""
  128:                 global ret
  129:                 ret=""
  130:                 
  131:                 def createTag(name,attrs):
  132:                         tag="<"
  133:                         tag+=name
  134:                         for attr in attrs.keys():
  135:                                 tag+=""" %s="%s" """%(attr,attrs[attr])
  136:                         tag+=">"
  137:                         return tag
  138:                                 
  139:                 def insertW(str):
  140:                     splitted=str.split()
  141:                     wordlist=["<w>%s</w>"%split for split in splitted]
  142:                     return "\n".join(wordlist)
  143: 
  144:                 # 3 handler functions
  145:                 def start_element(name, attrs):
  146:                         global ret
  147:                         ret+=createTag(name,attrs)
  148:                 def end_element(name):
  149:                         global ret
  150:                         ret+="</%s>"%(name.encode('utf-8'))
  151:                     
  152:                 def char_data(data):
  153:                         global ret
  154:                         ret+=insertW(data)
  155: 
  156:                 p = xml.parsers.expat.ParserCreate()
  157: 
  158:                 p.StartElementHandler = start_element
  159:                 p.EndElementHandler = end_element
  160:                 p.CharacterDataHandler = char_data
  161: 
  162:                 p.Parse(self.getPage(nr), 1)
  163:                 txt="""<wtag locator="xxx">
  164:                 <section lang="%s"><s>%s</s></section>
  165:                 </wtag>"""
  166:                 ret=txt%(lang,ret)
  167:                 
  168:                 return ret

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>