# File:  [Repository] / ECHO_content / ECHO_language.py
# Revision 1.2: download - view: text, annotated - select for diffs - revision graph
# Sat Nov 27 22:38:44 2004 UTC (19 years, 5 months ago) by dwinter
# Branches: MAIN
# CVS tags: HEAD
# Log message: added query string

"""Methoden fuer Language Technologies"""

def donatus(txt2):
    """Submit ``txt2`` to the Donatus XML-RPC service and return its answer.

    The text is encoded as UTF-8, wrapped in an ``xmlrpclib.Binary`` and
    passed to the remote ``donatus.analyze`` method.

    Args:
        txt2: text to analyse; it must provide ``encode('utf-8')``.

    Returns:
        The ``data`` attribute of the ``'morphData'`` entry of the response.
    """
    import xmlrpclib

    proxy = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc")

    # The service receives the text as a binary XML-RPC value.
    payload = xmlrpclib.Binary(txt2.encode('utf-8'))
    response = proxy.donatus.analyze(payload)

    return response['morphData'].data


def donatusVariant2Lemma(morphData):
    """Index the lemma forms of a morphology document by variant form.

    Args:
        morphData: XML text containing ``lemma`` elements. Each ``lemma``
            has a ``form`` attribute and contains ``variant`` elements that
            carry a ``form`` attribute of their own.

    Returns:
        A dict mapping every variant form to the list of lemma forms under
        which that variant occurs, in document order.
    """
    # Imported locally: the module has no top-level import block.
    import xml.dom.minidom

    ret = {}
    dom = xml.dom.minidom.parseString(morphData)
    for lemma in dom.getElementsByTagName('lemma'):
        lemmaForm = lemma.getAttribute('form')
        for variant in lemma.getElementsByTagName('variant'):
            # A variant may belong to several lemmas, so collect all of them.
            # (Assigning to the list's ``append`` attribute, as the previous
            # code did, raises AttributeError instead of appending.)
            ret.setdefault(variant.getAttribute('form'), []).append(lemmaForm)

    return ret

class ECHO_language:
    """Language-technology methods: word tagging and dictionary linking.

    ``lemmatize`` calls ``self.getPage(nr)``. That method is not defined in
    this class, so it has to be supplied by the class this one is combined
    with.
    """

    def _createTag(self, name, attrs):
        """Serialise an opening tag from an expat start-element event.

        Args:
            name: element name.
            attrs: mapping of attribute name to attribute value.

        Returns:
            ``<name`` followed by one `` key="value" `` group per attribute
            and a closing ``>``.
        """
        from xml.sax.saxutils import escape

        parts = ["<", name]
        for attr in attrs.keys():
            # expat hands over decoded values; escape them again so that the
            # serialised tag stays well-formed.
            value = escape(attrs[attr], {'"': "&quot;"})
            parts.append(""" %s="%s" """ % (attr, value))
        parts.append(">")
        return "".join(parts)

    def donatusVariant2Lemma(self, nr='1'):
        """Return the variant -> lemma forms mapping for page ``nr``.

        The page is word-tagged by ``lemmatize``, sent through the
        module-level ``donatus`` function and indexed by the module-level
        ``donatusVariant2Lemma`` function.
        """
        return donatusVariant2Lemma(donatus(self.lemmatize(nr)))

    def tagLex(self, nr="1"):
        """Return page ``nr`` with every word turned into a dictionary link.

        The word-tagged page from ``lemmatize`` is parsed again. All elements
        except ``w`` are copied through; the text of each ``w`` element is
        replaced by an ``<a>`` element whose ``word`` query parameter is the
        first lemma known for that word, or the word itself when the
        variant table has no entry for it. Text outside ``w`` elements is
        not copied.

        Args:
            nr: page identifier, passed on to ``lemmatize``.

        Returns:
            The generated markup as one string.
        """
        import xml.parsers.expat
        from xml.sax.saxutils import escape

        lemmatized = self.lemmatize(nr)
        variants = donatusVariant2Lemma(donatus(lemmatized))

        link = """<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """

        # State lives in locals captured by the handlers (mutable containers,
        # because the handlers have to update it) instead of module globals.
        pieces = []
        state = {"inWord": False}

        def start_element(name, attrs):
            if name == "w":
                # Emit nothing for <w>; just remember that a word follows.
                state["inWord"] = True
                return
            pieces.append(self._createTag(name, attrs))

        def end_element(name):
            if name == "w":
                state["inWord"] = False
                return
            pieces.append("</%s>" % name)

        def char_data(data):
            # Only the text of a <w> element becomes a link.
            if not (data and state["inWord"]):
                return
            state["inWord"] = False
            if data in variants:
                lemma = variants[data][0]
            else:
                lemma = data
            pieces.append(link % (escape(lemma, {'"': "&quot;"}), escape(data)))

        p = xml.parsers.expat.ParserCreate()
        # Deliver the text of an element in one call so that a word is never
        # split across several callbacks.
        p.buffer_text = True
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data

        p.Parse(lemmatized.encode('utf-8'), 1)

        return "".join(pieces)

    def lemmatize(self, nr='1', lang="de"):
        """Return page ``nr`` with every word wrapped in a ``w`` element.

        The page obtained from ``self.getPage(nr)`` is parsed; its elements
        are copied through and every run of text is split on whitespace,
        each word becoming ``<w>word</w>`` (words of one run are joined by
        newlines). The result is embedded in a
        ``wtag`` / ``section`` / ``s`` envelope.

        Args:
            nr: page identifier, passed on to ``self.getPage``.
            lang: value of the ``lang`` attribute of the ``section`` element.

        Returns:
            The word-tagged document as one string.
        """
        import xml.parsers.expat
        from xml.sax.saxutils import escape

        pieces = []

        def start_element(name, attrs):
            pieces.append(self._createTag(name, attrs))

        def end_element(name):
            pieces.append("</%s>" % name)

        def char_data(data):
            # Escape each word: expat has already resolved entities, and the
            # output is parsed as XML again later on.
            words = ["<w>%s</w>" % escape(word) for word in data.split()]
            pieces.append("\n".join(words))

        p = xml.parsers.expat.ParserCreate()
        # Deliver each text run in one call so that words are not cut at
        # arbitrary chunk boundaries.
        p.buffer_text = True
        p.StartElementHandler = start_element
        p.EndElementHandler = end_element
        p.CharacterDataHandler = char_data

        p.Parse(self.getPage(nr), 1)

        # Envelope written with explicit escapes so that the emitted
        # whitespace does not depend on how this source file is indented.
        txt = ('<wtag locator="xxx">\n'
               '\t\t<section lang="%s"><s>%s</s></section>\n'
               '\t\t</wtag>')
        return txt % (lang, "".join(pieces))

# FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>