# CDLI splitter pipeline elements for Products.ZCTextIndex.
# (Reconstructed from a corrupted CVS merge of revisions 1.4, 2007/02/08
# and 1.5, 2007/03/21; the original header also referenced
# Products.ZCTextIndex.PipelineFactory -- see the import block below.)
import codecs
import logging
import os
import re
from types import StringType

import PyLucene
from Products.ZCTextIndex.PipelineFactory import element_factory
|
def getSupportedEncoding(encodings):
    """Return the first encoding from *encodings* known to this Python.

    Falls back to ``"utf-8"`` when none of the candidates is available
    (or when *encodings* is empty).

    NOTE(review): the body of this function was destroyed by the merge;
    only the loop header survived.  Reconstructed with ``codecs.lookup``
    -- confirm against the repository history.
    """
    for encoding in encodings:
        try:
            codecs.lookup(encoding)
        except LookupError:
            # candidate codec is unknown, try the next one
            continue
        return encoding
    return "utf-8"
# Lines starting with one of these characters are ignored by the splitters.
ignoreLines = ['$', '@', '#', '&']

separators = ['']

# Commas preceded by s/S/t/T/h/H are relevant for graphemes and are kept;
# every other comma is removed via this capture-group substitution.
komma_exception = "([^sStThH]),"

# Characters to strip when splitting graphemes.
deleteGraphems = r"\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?"

# Characters to strip when splitting words (keeps '-', '{', '}').
deleteWords = r"<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?"
|
class IndexLine(object):
    """Index a single text line with (Py)Lucene.

    Instantiating this class opens (creating it if necessary) the index
    directory *storeDir*, writes one document with the fields ``name``,
    ``line`` and ``contents``, optimizes and closes the index again.
    """

    def __init__(self, storeDir, analyzer, name, line, content):
        logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content)))

        # Create the index directory on first use.
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        store = PyLucene.FSDirectory.getDirectory(storeDir, True)
        writer = PyLucene.IndexWriter(store, analyzer, True)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(writer, name, line, content)
        writer.optimize()
        writer.close()

    def indexDocs(self, writer, name, line, content):
        """Add one document with name/line/contents fields to *writer*.

        Fixed: the merged source referenced the undefined names ``pn``
        and ``i``; the parameters ``name`` and ``line`` are used instead,
        and the ``contents`` field now stores ``content`` (the analysed
        text) rather than the line value -- the ``content`` parameter was
        otherwise unused.
        """
        doc = PyLucene.Document()

        # Document id (pNum) -- stored verbatim, not tokenized.
        doc.add(PyLucene.Field("name", name,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        # Line number -- stored verbatim, not tokenized.
        doc.add(PyLucene.Field("line", str(line),
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.UN_TOKENIZED))

        # Analysed line text -- tokenized for full-text search.
        doc.add(PyLucene.Field("contents", content,
                               PyLucene.Field.Store.YES,
                               PyLucene.Field.Index.TOKENIZED))

        writer.addDocument(doc)
|
class cdliSplitter:
    """Base class for the CDLI splitters.

    The word and the grapheme splitter differ only in their exclusion
    list (``delete``) and their index name.  (Docstring translated from
    German.)
    """

    # Encoding assumed for non-unicode input strings.
    default_encoding = "utf-8"
    # Regex of characters removed before splitting; overridden in subclasses.
    delete = deleteGraphems
    indexName = "cdliSplitter"

    def process(self, lst):
        """Split the texts in *lst* into index terms and return them.

        Side effects: stores every term in the CDLI line index via
        ``Zope2.app().cdliRoot`` (committing a transaction per term), or,
        for the lucene splitter, indexes whole lines with PyLucene.

        NOTE(review): this body was reconstructed from a corrupted merge;
        in particular the inner ``for s in t.split("\\n")`` loop header was
        lost in the diff -- confirm against the repository history.
        """
        result = []
        pNum = None
        lineNum = None

        for t in lst:
            # NOTE(review): str.replace does not work in place, so this
            # statement has no effect; kept as in the original source.
            t.replace("\r", "\n")

            for s in t.split("\n"):
                if type(s) is StringType:  # not unicode
                    s = unicode(s, self.default_encoding, 'replace')

                if (s != "") and (s[0] == "&"):  # store pNum
                    pNum = s[1:8]
                    logging.debug("storing: %s" % pNum)
                elif (s != "") and (not (s[0] in ignoreLines)):
                    # ignore everything before "."
                    splitted = s.split(".")
                    if len(splitted) == 1:  # no dot: the whole line is text
                        txt = splitted[0]
                    else:
                        lineNum = splitted[0]  # store line number
                        txt = splitted[1]

                    analyse = txt
                    # delete kommata except kommata relevant in graphems
                    analyse = re.sub(komma_exception, r"\1", analyse)
                    analyse = re.sub(self.delete, ' ', analyse)  # deletions

                    if self.indexName == "luceneSplitter":
                        if pNum:
                            analyser = PyLucene.StandardAnalyzer()
                            logging.error("calling lucene")
                            IndexLine("/tmp/index", analyser, pNum, lineNum, analyse)
                    else:
                        for w in analyse.split(" "):
                            w = w.strip()
                            if w != '':
                                # only when pNum was found (first call of the
                                # splitter; it is always called twice in the
                                # pipeline)
                                if pNum:
                                    Zope2.app().cdliRoot.storeInLineIndex(
                                        self.indexName, w, (pNum, lineNum))
                                    transaction.get().commit()
                                result.append(w)
        return result
class graphemeSplitter(cdliSplitter):
    """Splitter that uses the grapheme exclusion list."""

    delete = deleteGraphems
    indexName = "graphemeSplitter"
|
class wordSplitter(cdliSplitter):
    """Splitter that uses the word exclusion list."""

    delete = deleteWords
    indexName = "wordSplitter"
|
class luceneSplitter(cdliSplitter):
    """Splitter that indexes whole lines with PyLucene instead of the
    CDLI line index (selected via ``indexName`` in ``process``)."""

    delete = deleteWords
    indexName = "luceneSplitter"
|
# Register the CDLI splitters with the ZCTextIndex pipeline factory.
# The bare ``except`` clauses of the original were narrowed to
# ``ValueError``, which the accompanying comments name as the exception
# raised for an already-registered splitter.
try:
    element_factory.registerFactory('Word Splitter',
        'CDLI grapheme splitter', graphemeSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
        'CDLI word splitter', wordSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass

try:
    element_factory.registerFactory('Word Splitter',
        'CDLI lucene splitter', luceneSplitter)
except ValueError:
    # in case the splitter is already registered, ValueError is raised
    pass
if __name__ == '__main__':
    # Manual smoke test of the encoding handling (Python 2 only:
    # relies on the builtin ``unicode``).
    a = 'abc def我们的很 好。'
    u = unicode(a, 'gbk')