version 1.3, 2005/10/26 08:35:53
|
version 1.10, 2006/10/18 14:42:33
|
Line 1
|
Line 1
|
"""Methoden fuer Language Technologies""" |
"""Methoden fuer Language Technologies""" |
|
|
def donatus(txt2): |
from Products.PageTemplates.PageTemplateFile import PageTemplateFile |
import xmlrpclib |
from Products.ECHO_content.analyseAndTag.analyseAndTag import DonatusFile |
|
from OFS.SimpleItem import SimpleItem |
|
from OFS.Folder import Folder |
|
import xml.parsers |
|
import os.path |
|
import urlparse,urllib |
|
from Globals import package_home |
|
|
server = xmlrpclib.ServerProxy("http://archimedes.fas.harvard.edu/cgi-bin/donatus-rpc") |
class ECHO_language: |
|
"""language methods""" |
|
|
txt=txt2.encode('utf-8') |
|
bin=xmlrpclib.Binary(txt) |
|
|
|
|
def tagLex(self,nr="1",id=None): |
|
"""gerateword tags""" |
|
|
|
|
ret=server.donatus.analyze(bin) |
df=DonatusFile(txt=self.getPage(_pn=nr,_id=id),baseUri=self.baseUri) |
|
|
|
return df.convertedXML() |
|
#return DonatusFile(txt=self.getPage(_pn=nr)).convertedXML() |
|
|
return ret['morphData'].data |
class Collection(SimpleItem): |
|
def getCollectionXML(self,RESPONSE=None): |
|
|
|
"""get collection as xml""" |
|
return self.aq_parent.getCollectionXML(collection=self.getId(),RESPONSE=RESPONSE) |
|
|
def donatusVariant2Lemma(morphData): |
def __init__(self,id): |
"""creates hash variant -> morphdata""" |
"""initialise""" |
ret={} |
self.id=id |
dom=xml.dom.minidom.parseString(morphData) |
self.entries=[] |
lemmas=dom.getElementsByTagName('lemma') |
|
for lemma in lemmas: |
|
variants=lemma.getElementsByTagName('variant') |
|
for variant in variants: |
|
atr=variant.getAttribute('form') |
|
if ret.has_key(atr): |
|
ret[atr].append=lemma.getAttribute('form') |
|
else: |
|
ret[atr]=[lemma.getAttribute('form')] |
|
|
|
return ret |
|
|
|
class ECHO_language: |
def getEntries(self): |
"""language methods""" |
"""get the entries""" |
|
entries=self.entries |
|
for entry in entries: #backward compatibility, cannot be removed a.s.a.p. |
|
print entry |
|
if entry.has_key('master') and (len(entry['master'])<3): |
|
entry['master']=(entry['master'][0],entry['master'][1],'') |
|
if entry.has_key('slave') and (len(entry['slave'])<3): |
|
entry['slave']=(entry['slave'][0],entry['slave'][1],'') |
|
|
def donatusVariant2Lemma(self,nr='1'): |
return self.entries |
"""analyze by donatus""" |
|
return donatusVariant2Lemma(donatus(self.lemmatize(nr))) |
def deleteEntry(self,nr): |
|
"""delete an entry""" |
def tagLex(self,nr="1"): |
del(self.entries[nr]) |
"""generate Links""" |
|
global retLex |
def changeEntry(self,nr,slaveUrl): |
global toggle |
"""change an entry, only slaveUrl""" |
|
tmp=self.entries[nr] |
toggle=0 |
tmp['slave']=(slaveUrl,"","") |
retLex="" |
|
|
entries=self.entries[0:] |
lemmatized=self.lemmatize(nr)[0:] |
entries[nr]=tmp |
#print "ho",repr(lemmatized) |
self.entries=entries[0:] |
variants=donatusVariant2Lemma(donatus(lemmatized)) |
|
|
def appendEntry(self,fn,id,type,pagelink): |
def createTag(name,attrs): |
"""append an entry""" |
global toggle |
#check if last entry is complete |
|
createNew=False |
if name=="w": |
|
toggle=1 |
if len(self.entries)==0: #noch gar kein Eintrag |
return "" |
createNew=True |
else: |
|
tag="<" |
|
tag+=name |
|
for attr in attrs.keys(): |
|
tag+=""" %s="%s" """%(attr,attrs[attr]) |
|
tag+=">" |
|
return tag |
|
|
|
def createData(data): |
|
global toggle |
|
astring="""<a href="http://141.14.236.86/cgi-bin/toc/dict?step=remotetable;word=%s;lang=de" target="_blank">%s</a> """ |
|
if toggle: # tag war ein w |
|
toggle=0 |
|
if variants.has_key(data): |
|
return astring%(variants[data][0],data) |
|
else: |
else: |
return astring%(data,data) |
entry=self.entries[-1] |
|
if entry.get('master',None) and entry.get('slave',None): |
|
createNew=True |
|
if createNew: |
|
|
|
self.entries.append({}) |
|
entry=self.entries[-1] |
|
if type=="master": |
|
entry['master']=(fn,id,pagelink) |
|
elif type=="slave": |
|
entry['slave']=(fn,id,pagelink) |
|
|
|
entries=self.entries[0:] |
|
entries[-1]=entry |
|
self.entries=entries[0:] |
|
|
# 3 handler functions |
class ECHO_linkCreator(Folder): |
def start_element(name, attrs): |
"""creator for links""" |
global retLex |
|
|
|
retLex+=createTag(name,attrs) |
meta_type="ECHO_linkCreator" |
def end_element(name): |
|
global retLex |
|
if not name=="w": |
|
retLex+="</%s>"%(name.encode('utf-8')) |
|
|
|
|
|
def char_data(data): |
|
global retLex |
|
if data: |
|
try: |
|
retLex+=createData(data) |
|
except: |
|
"""no""" |
|
|
|
p = xml.parsers.expat.ParserCreate() |
|
|
|
p.StartElementHandler = start_element |
def getCollectionEntries(self,collection): |
p.EndElementHandler = end_element |
col=getattr(self,collection,None) |
p.CharacterDataHandler = char_data |
if not col: |
|
return [] |
|
|
p.Parse(lemmatized.encode('utf-8'),1) |
return col.getEntries() |
#print repr(lemmatized.encode('utf-8')) |
|
|
|
return retLex |
|
|
|
|
def getCollectionXML(self,collection=None,RESPONSE=None): |
|
"""exports the collection as an XML file""" |
|
if not collection: |
|
return "<error>no collection: need parameter collection=COLLECTION_NAME</error>" |
|
|
def lemmatize(self,nr='1',lang="de"): |
i=0 |
"""lemmatize""" |
|
global ret |
|
ret="" |
ret="" |
|
ret+="""<?xml version="1.0" encoding="UTF-8"?>""" |
|
ret+="""<linklist xmlns="http://www.mpiwg-berlin.mpg.de/namespace">""" |
|
ret+="""<linklistname>%s</linklistname>"""%collection |
|
ret+="""<masterurl ref="%s"/>"""%self.getUrls(collection)[0] |
|
ret+="""<slaveurl ref="%s"/>"""%self.getUrls(collection)[1] |
|
|
|
for entry in self.getCollectionEntries(collection): |
|
ret+="""<link id="%s">"""%i |
|
i+=1 |
|
|
|
if entry.has_key('master'): |
|
ms=entry['master'] |
|
|
def createTag(name,attrs): |
|
tag="<" |
|
tag+=name |
|
for attr in attrs.keys(): |
|
tag+=""" %s="%s" """%(attr,attrs[attr]) |
|
tag+=">" |
|
return tag |
|
|
|
def insertW(str): |
|
splitted=str.split() |
|
wordlist=["<w>%s</w>"%split for split in splitted] |
|
return string.join(wordlist,'\n') |
|
|
|
# 3 handler functions |
|
def start_element(name, attrs): |
|
global ret |
|
ret+=createTag(name,attrs) |
|
def end_element(name): |
|
global ret |
|
ret+="</%s>"%(name.encode('utf-8')) |
|
|
|
def char_data(data): |
|
global ret |
|
ret+=insertW(data) |
|
|
|
p = xml.parsers.expat.ParserCreate() |
|
|
|
p.StartElementHandler = start_element |
|
p.EndElementHandler = end_element |
|
p.CharacterDataHandler = char_data |
|
|
|
p.Parse(self.getPage(nr), 1) |
|
txt="""<wtag locator="xxx"> |
|
<section lang="%s"><s>%s</s></section> |
|
</wtag>""" |
|
ret=txt%(lang,ret) |
|
|
|
|
try: |
|
if urlparse.urlparse(ms[0])[0]=="http": # url |
|
ret+="""<source filename="%s"/>"""%urllib.quote(ms[0]) |
|
else: |
|
ret+="""<source filename="%s" refid="%s">"""%(ms[0],ms[1]) |
|
splitted=ms[2].split("/") |
|
if (len(splitted)>3): |
|
ret+="""<pagelink refid="%s" selectionNodeIndex="%s"/>"""%(splitted[0],splitted[3]) |
|
|
|
ret+="""</source>""" |
|
except: #ohne pagelink& |
|
ret+="""<source filename="%s" refid="%s"/>"""%ms |
|
if entry.has_key('slave'): |
|
ms=entry['slave'] |
|
try: |
|
if urlparse.urlparse(ms[0])[0]=="http": # url |
|
ret+="""<target filename="%s"/>"""%urllib.quote(ms[0]) |
|
else: |
|
ret+="""<target filename="%s" refid="%s">"""%(ms[0],ms[1]) |
|
splitted=ms[2].split("/") |
|
if (len(splitted)>3): |
|
ret+="""<pagelink refid="%s" selectionNodeIndex="%s"/>"""%(splitted[0],splitted[3]) |
|
ret+="""</target>""" |
|
except: #ohne pagelink |
|
ret+="""<target filename="%s" refid="%s"/>"""%ms |
|
|
|
ret+="</link>" |
|
ret+="""</linklist>""" |
|
if RESPONSE: |
|
RESPONSE.setHeader("Content-Type","text/xml") |
return ret |
return ret |
|
def index_html(self, collection=None):
    """Render the link-creator main page for one collection.

    collection -- id of the collection to show. When it is missing or
                  empty, a plain-text hint is returned instead of a page.

    Returns the rendered 'ECHO_linkCreator_main' page template, called
    with the collection id and the collection's masterUrl / slaveUrl
    (each '' when not set).  Returns an empty list when no usable object
    with that id can be found on self.
    """
    if not collection:
        return "no collection: need parameter collection=COLLECTION_NAME"

    # The template is bound to self before the collection lookup, exactly
    # as in the original statement order.
    template_path = os.path.join(package_home(globals()), 'zpt', 'ECHO_linkCreator_main')
    page = PageTemplateFile(template_path).__of__(self)

    target = getattr(self, collection, None)
    if not target:
        return []

    return page(
        collection=collection,
        masterUrl=getattr(target, 'masterUrl', ''),
        slaveUrl=getattr(target, 'slaveUrl', ''),
    )
|
|
|
def addEntry(self, collection, fn, id, type, pagelink, fromurl=None, RESPONSE=None, REQUEST=None):
    """Append an entry to a collection, creating the collection on demand.

    collection -- id of the collection object stored on self
    fn, id, type, pagelink -- passed through unchanged to the
                  collection's appendEntry(); in the Collection class
                  visible in this file, type is compared against
                  "master" and "slave"
    fromurl    -- optional URL to redirect back to afterwards
    RESPONSE   -- optional response object; the redirect only happens
                  when both fromurl and RESPONSE are given
    REQUEST    -- accepted for interface compatibility, not used here

    Returns None.
    """
    col = getattr(self, collection, None)
    if not col:
        # No usable object under this id yet: create the collection and
        # fetch it again through self.
        self._setObject(collection, Collection(collection))
        col = getattr(self, collection)

    col.appendEntry(fn, id, type, pagelink)

    if fromurl and RESPONSE:
        # The original called DateTime() here, but the module never
        # imports it, so this branch raised NameError.  A standard-library
        # HTTP date one day in the past gives the same "already expired"
        # effect.
        import time
        try:
            from email.utils import formatdate
        except ImportError:  # very old Python 2 releases
            from email.Utils import formatdate

        RESPONSE.setHeader("Expires", formatdate(time.time() - 86400, usegmt=True))
        RESPONSE.setHeader("Cache-Control", "no-cache")
        RESPONSE.redirect(fromurl)
|
|
|
|
|
def changeEntry(self, collection, nr, slaveUrl, RESPONSE=None):
    """Change entry nr of a collection (only its slave URL for now).

    collection -- id of the collection object stored on self
    nr         -- entry position, handed to the collection's changeEntry()
    slaveUrl   -- new slave URL for that entry
    RESPONSE   -- when given, redirect back to this object's page for
                  the same collection

    A missing collection is not handled: the lookup yields None and the
    call on it raises AttributeError.
    """
    getattr(self, collection, None).changeEntry(nr, slaveUrl)

    if not RESPONSE:
        return
    RESPONSE.redirect("%s?collection=%s" % (self.absolute_url(), collection))
|
|
|
def removeEntry(self, collection, nr, RESPONSE=None):
    """Delete entry nr from a collection.

    collection -- id of the collection object stored on self
    nr         -- entry position, handed to the collection's deleteEntry()
    RESPONSE   -- when given, redirect back to this object's page for
                  the same collection

    A missing collection is not handled: the lookup yields None and the
    call on it raises AttributeError.
    """
    getattr(self, collection, None).deleteEntry(nr)

    if not RESPONSE:
        return
    RESPONSE.redirect("%s?collection=%s" % (self.absolute_url(), collection))
|
|
|
def setUrls(self, collection, masterUrl, slaveUrl, RESPONSE=None):
    """Store the master and slave URLs on a collection.

    collection -- id of the collection object stored on self
    masterUrl  -- value saved as the collection's 'masterUrl' attribute
    slaveUrl   -- value saved as the collection's 'slaveUrl' attribute
    RESPONSE   -- when given, redirect back to this object's page for
                  the same collection
    """
    target = getattr(self, collection, None)
    target.masterUrl = masterUrl
    target.slaveUrl = slaveUrl

    if not RESPONSE:
        return
    RESPONSE.redirect("%s?collection=%s" % (self.absolute_url(), collection))
|
|
|
def getUrls(self, collection, RESPONSE=None):
    """Return the (masterUrl, slaveUrl) pair stored on a collection.

    collection -- id of the collection object stored on self
    RESPONSE   -- accepted for interface compatibility, not used here

    Each URL falls back to '' when it has not been set (or when the
    collection cannot be found).  index_html reads the same attributes
    with the same default; the original raised AttributeError here
    instead.
    """
    col = getattr(self, collection, None)
    return getattr(col, 'masterUrl', ''), getattr(col, 'slaveUrl', '')
|
|
|
def manage_addECHO_linkCreatorForm(self, RESPONSE=None):
    """Entry point for the add form.

    No form is rendered; the call is passed straight on to
    manage_addECHO_linkCreator with the same RESPONSE.
    """
    manage_addECHO_linkCreator(self, RESPONSE=RESPONSE)
|
|
|
def manage_addECHO_linkCreator(self, RESPONSE=None):
    """Add an ECHO_linkCreator with the fixed id 'linkCreator' to self.

    RESPONSE -- when not None, redirect to 'manage_main' afterwards
    """
    creator_id = 'linkCreator'
    self._setObject(creator_id, ECHO_linkCreator(creator_id))

    if RESPONSE is None:
        return
    RESPONSE.redirect('manage_main')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|