File:  [Repository] / ECHO_content / ECHO_language.py
Revision 1.3: download - view: text, annotated - select for diffs - revision graph
Wed Oct 26 08:35:53 2005 UTC (18 years, 7 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD
minor

    1: """Methods for language technologies"""
    2: 
    def donatus(txt2, server_url="http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc"):
        """Send text to the Donatus XML-RPC service and return its morphology data.

        txt2       -- text to analyse; it is encoded as UTF-8 before sending.
        server_url -- endpoint of the Donatus XML-RPC service (defaults to the
                      address this module has always used).

        Returns the ``data`` attribute of the ``'morphData'`` entry in the
        reply of ``donatus.analyze`` (presumably an XML document as a byte
        string -- verify against the service).

        Network and XML-RPC errors raised by the proxy are not caught here.
        """
        try:
            import xmlrpclib  # Python 2 module name
        except ImportError:
            import xmlrpc.client as xmlrpclib  # same module under Python 3

        server = xmlrpclib.ServerProxy(server_url)

        # The text travels as an XML-RPC binary value.
        payload = xmlrpclib.Binary(txt2.encode('utf-8'))
        reply = server.donatus.analyze(payload)

        return reply['morphData'].data
   17: 
   18: 
   19: def donatusVariant2Lemma(morphData):
   20:         """creates hash variant -> morphdata"""
   21:         ret={}
   22:         dom=xml.dom.minidom.parseString(morphData)
   23:         lemmas=dom.getElementsByTagName('lemma')
   24:         for lemma in lemmas:
   25:                 variants=lemma.getElementsByTagName('variant')
   26:                 for variant in variants:
   27:                         atr=variant.getAttribute('form')
   28:                         if ret.has_key(atr):
   29:                                 ret[atr].append=lemma.getAttribute('form')
   30:                         else:
   31:                                 ret[atr]=[lemma.getAttribute('form')]
   32: 
   33:         return ret
   34: 
   class ECHO_language:
       """Language methods.

       The methods call ``self.getPage(nr)``, which is not defined in this
       class; it has to be provided by whatever this class is combined with.
       """

       def _createTag(self, name, attrs):
           """Serialise an opening tag from an expat element name and attribute dict.

           Attribute values are XML-escaped so that the result stays
           well-formed when the source contained ``&``, ``<`` or ``"``.
           """
           from xml.sax.saxutils import escape

           tag = "<" + name
           for attr in attrs.keys():
               tag += ' %s="%s" ' % (attr, escape(attrs[attr], {'"': '&quot;'}))
           return tag + ">"

       def donatusVariant2Lemma(self, nr='1'):
           """analyze by donatus

           Lemmatizes page *nr*, sends it to the Donatus service and returns
           the resulting variant -> lemma-list dict.
           """
           return donatusVariant2Lemma(donatus(self.lemmatize(nr)))

       def tagLex(self, nr="1"):
           """generate Links

           Lemmatizes page *nr*, asks Donatus for the lemma of every word and
           returns the markup with each ``<w>word</w>`` replaced by a link to
           the dictionary lookup (using the first lemma when one is known,
           otherwise the word itself). Text outside ``<w>`` elements is
           dropped; all other tags are passed through.
           """
           from xml.parsers import expat
           from xml.sax.saxutils import escape

           link = ('<a href="http://141.14.236.86/cgi-bin/toc/dict?'
                   'step=remotetable;word=%s;lang=de" target="_blank">%s</a> ')

           lemmatized = self.lemmatize(nr)
           variants = donatusVariant2Lemma(donatus(lemmatized))

           # Parser state is kept in local containers (not module globals) so
           # that concurrent or nested calls cannot corrupt each other.
           pieces = []
           word = []                    # text chunks of the <w> currently open
           state = {'in_word': False}

           def start_element(name, attrs):
               if name == "w":
                   state['in_word'] = True
               else:
                   pieces.append(self._createTag(name, attrs))

           def char_data(data):
               # expat may deliver one text node in several chunks; collect
               # them and build the link once the word is complete.
               if state['in_word']:
                   word.append(data)

           def end_element(name):
               if name != "w":
                   pieces.append("</%s>" % name)
                   return
               state['in_word'] = False
               form = ''.join(word)
               del word[:]
               if form:
                   lemma = variants.get(form, [form])[0]
                   pieces.append(link % (escape(lemma, {'"': '&quot;'}),
                                         escape(form)))

           p = expat.ParserCreate()
           p.StartElementHandler = start_element
           p.EndElementHandler = end_element
           p.CharacterDataHandler = char_data
           p.Parse(lemmatized.encode('utf-8'), 1)

           return ''.join(pieces)

       def lemmatize(self, nr='1', lang="de"):
           """lemmatize

           Parses the XML returned by ``self.getPage(nr)`` and re-serialises
           it with every whitespace-separated word of its text wrapped in
           ``<w>...</w>`` (words of one text node are joined by newlines).
           The result is wrapped in
           ``<wtag><section lang=LANG><s>...</s></section></wtag>``.

           Raises ``xml.parsers.expat.ExpatError`` if the page is not
           well-formed XML.
           """
           from xml.parsers import expat
           from xml.sax.saxutils import escape

           pieces = []
           text = []       # chunks of the text node currently being read

           def flush_text():
               # Words are split only once the whole text node is known, so a
               # word cut in two by expat's chunking is not tagged twice.
               if text:
                   words = ''.join(text).split()
                   pieces.append('\n'.join(['<w>%s</w>' % escape(w) for w in words]))
                   del text[:]

           def start_element(name, attrs):
               flush_text()
               pieces.append(self._createTag(name, attrs))

           def end_element(name):
               flush_text()
               pieces.append("</%s>" % name)

           p = expat.ParserCreate()
           p.StartElementHandler = start_element
           p.EndElementHandler = end_element
           p.CharacterDataHandler = text.append
           p.Parse(self.getPage(nr), 1)

           # Written piecewise so the embedded whitespace does not depend on
           # how this method happens to be indented.
           txt = ('<wtag locator="xxx">\n'
                  '                <section lang="%s"><s>%s</s></section>\n'
                  '                </wtag>')

           return txt % (escape(lang, {'"': '&quot;'}), ''.join(pieces))

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>