File:  [Repository] / ECHO_content / ECHO_language.py
Revision 1.2: download - view: text, annotated - select for diffs - revision graph
Sat Nov 27 22:38:44 2004 UTC (19 years, 7 months ago) by dwinter
Branches: MAIN
CVS tags: HEAD
added query string

    1: """Methoden fuer Language Technologies"""
    2: 
    3: def donatus(txt2):
    4: 	import xmlrpclib
    5: 
    6: 	server = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")
    7: 
    8: 	txt=txt2.encode('utf-8')
    9: 	bin=xmlrpclib.Binary(txt)
   10: 
   11: 	
   12: 
   13: 	ret=server.donatus.analyze(bin)
   14: 
   15: 		
   16: 	return ret['morphData'].data
   17: 
   18: 
   19: def donatusVariant2Lemma(morphData):
   20: 	"""creates hash variant -> morphdata"""
   21: 	ret={}
   22: 	dom=xml.dom.minidom.parseString(morphData)
   23: 	lemmas=dom.getElementsByTagName('lemma')
   24: 	for lemma in lemmas:
   25: 		variants=lemma.getElementsByTagName('variant')
   26: 		for variant in variants:
   27: 			atr=variant.getAttribute('form')
   28: 			if ret.has_key(atr):
   29: 				ret[atr].append=lemma.getAttribute('form')
   30: 			else:
   31: 				ret[atr]=[lemma.getAttribute('form')]
   32: 
   33: 	return ret
   34: 
   35: class ECHO_language:
   36: 	"""language methods"""
   37: 	
   38: 	def donatusVariant2Lemma(self,nr='1'):
   39: 		"""analyze by donatus"""
   40: 		return donatusVariant2Lemma(donatus(self.lemmatize(nr)))
   41: 
   42: 	def tagLex(self,nr="1"):
   43: 		"""generate Links"""
   44: 		global retLex
   45: 		global toggle
   46: 
   47: 		toggle=0
   48: 		retLex=""
   49: 
   50: 		lemmatized=self.lemmatize(nr)[0:]
   51: 		#print "ho",repr(lemmatized)
   52: 		variants=donatusVariant2Lemma(donatus(lemmatized))
   53: 		
   54: 		def createTag(name,attrs):
   55:                         global toggle
   56: 			
   57: 			if name=="w":
   58: 				toggle=1
   59: 				return ""
   60: 			else:
   61: 				tag="<"
   62: 				tag+=name
   63: 				for attr in attrs.keys():
   64: 					tag+=""" %s="%s" """%(attr,attrs[attr])
   65: 				tag+=">"
   66: 			return tag
   67: 				
   68: 		def createData(data):
   69: 			global toggle
   70: 			astring="""<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """
   71: 			if toggle: # tag war ein w
   72: 				toggle=0
   73: 				if variants.has_key(data):
   74: 					return astring%(variants[data][0],data)
   75: 				else:
   76: 					return astring%(data,data)
   77: 				
   78: 					
   79: 
   80: 		# 3 handler functions
   81: 		def start_element(name, attrs):
   82: 			global retLex
   83: 			
   84: 			retLex+=createTag(name,attrs)
   85: 		def end_element(name):
   86: 			global retLex
   87: 			if not name=="w":
   88: 				retLex+="</%s>"%(name.encode('utf-8'))
   89: 			
   90: 		    
   91: 		def char_data(data):
   92: 			global retLex
   93: 			if data:
   94: 				try:
   95: 					retLex+=createData(data)
   96: 				except:
   97: 					"""no"""
   98: 					
   99: 		p = xml.parsers.expat.ParserCreate()
  100: 
  101: 		p.StartElementHandler = start_element
  102: 		p.EndElementHandler = end_element
  103: 		p.CharacterDataHandler = char_data
  104: 		
  105: 		p.Parse(lemmatized.encode('utf-8'),1)
  106: 		#print repr(lemmatized.encode('utf-8'))
  107: 	
  108: 		return retLex
  109: 	
  110: 		
  111: 	def lemmatize(self,nr='1',lang="de"):
  112: 		"""lemmatize"""
  113: 		global ret
  114: 		ret=""
  115: 		
  116: 		def createTag(name,attrs):
  117: 			tag="<"
  118: 			tag+=name
  119: 			for attr in attrs.keys():
  120: 				tag+=""" %s="%s" """%(attr,attrs[attr])
  121: 			tag+=">"
  122: 			return tag
  123: 				
  124: 		def insertW(str):
  125: 		    splitted=str.split()
  126: 		    wordlist=["<w>%s</w>"%split for split in splitted]
  127: 		    return string.join(wordlist,'\n')
  128: 
  129: 		# 3 handler functions
  130: 		def start_element(name, attrs):
  131: 			global ret
  132: 			ret+=createTag(name,attrs)
  133: 		def end_element(name):
  134: 			global ret
  135: 			ret+="</%s>"%(name.encode('utf-8'))
  136: 		    
  137: 		def char_data(data):
  138: 			global ret
  139: 			ret+=insertW(data)
  140: 
  141: 		p = xml.parsers.expat.ParserCreate()
  142: 
  143: 		p.StartElementHandler = start_element
  144: 		p.EndElementHandler = end_element
  145: 		p.CharacterDataHandler = char_data
  146: 
  147: 		p.Parse(self.getPage(nr), 1)
  148: 		txt="""<wtag locator="xxx">
  149: 		<section lang="%s"><s>%s</s></section>
  150: 		</wtag>"""
  151: 		ret=txt%(lang,ret)
  152: 		
  153: 		return ret

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>