cdli/cdliSplitter.py - diff

Return to cdliSplitter.py CVS log

Up to [Repository] / cdli

Diff for /cdli/cdliSplitter.py between versions 1.3 and 1.7

version 1.3, 2007/01/24 18:05:24	version 1.7, 2007/08/31 14:22:52
Line 2	Line 2
Author splitter	Author splitter
"""	"""

import Zope	import Zope2
import transaction	import transaction

from Products.ZCTextIndex.ISplitter import ISplitter	from Products.ZCTextIndex.ISplitter import ISplitter
Line 10 from Products.ZCTextIndex.PipelineFactor	Line 10 from Products.ZCTextIndex.PipelineFactor

import re	import re
from types import StringType	from types import StringType
	import logging
	try:
	import PyLucene
	except:
	print "no Lucene support"

def getSupportedEncoding(encodings):	def getSupportedEncoding(encodings):
for encoding in encodings:	for encoding in encodings:
Line 25 def getSupportedEncoding(encodings):	Line 30 def getSupportedEncoding(encodings):
"""beta of a fulltext splitter for cdli	"""beta of a fulltext splitter for cdli

"""	"""
ignoreLines=['$','@','#','&']	ignoreLines=['$','@','#','&','>']
separators=['']	separators=['']
komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted	komma_exception="([^sStThH])," # komma relevant for graphemes will not be deleted
delete="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?" # for graphems	deleteGraphems="\{\|\}\|<\|>\|$\|$\|-\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?" # for graphems
#delete="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|!\|?" for words	deleteWords="<\|>\|$\|$\|_\|\#\|,\|\\|\|\]\|\[\|\!\|\?"# for words

class graphemeSplitter:	class IndexLine(object):
	"""index a line with lucene"""

	def __init__(self, storeDir, analyzer,name,line,content):
	logging.error("i am here %s %s %s %s %s"%((storeDir, analyzer,name,line,content)))
	if not os.path.exists(storeDir):
	os.mkdir(storeDir)
	store = PyLucene.FSDirectory.getDirectory(storeDir, True)
	writer = PyLucene.IndexWriter(store, analyzer, True)
	writer.setMaxFieldLength(1048576)
	self.indexDocs(writer,name,line,content)
	writer.optimize()
	writer.close()

	def indexDocs(self, writer,name,line,content):

	doc = PyLucene.Document()
	doc.add(PyLucene.Field("name", pn,
	PyLucene.Field.Store.YES,
	PyLucene.Field.Index.UN_TOKENIZED))

	doc.add(PyLucene.Field("line", str(i),
	PyLucene.Field.Store.YES,
	PyLucene.Field.Index.UN_TOKENIZED))


	doc.add(PyLucene.Field("contents", line,
	PyLucene.Field.Store.YES,
	PyLucene.Field.Index.TOKENIZED))

	writer.addDocument(doc)

	class cdliSplitter:
	"""basis class for splitter,
	der Unterschied zwischen Word und Graphemesplitter
	ist lediglich die unterschiedliche Ausschließungsliste"""

default_encoding = "utf-8"	default_encoding = "utf-8"
	delete=deleteGraphems
	indexName="cdliSplitter"


def process(self, lst):	def process(self, lst):
result = []	result = []
pNum=None	pNum=None
lineNum=None	lineNum=None


#print "LLLL",lst


for t in lst:	for t in lst:

t.replace("\r","\n")	t.replace("\r","\n")
Line 52 class graphemeSplitter:	Line 91 class graphemeSplitter:
if type(s) is StringType: # not unicode	if type(s) is StringType: # not unicode
s = unicode(s, self.default_encoding, 'replace')	s = unicode(s, self.default_encoding, 'replace')

#ignore lines

if (s!="") and (s[0]=="&"): # store pNum	if (s!="") and (s[0]=="&"): # store pNum
pNum=s[1:8]	pNum=s[1:8]
	logging.debug("storing: %s"%pNum)
elif (s!="") and (not (s[0] in ignoreLines)):	elif (s!="") and (not (s[0] in ignoreLines)):


#ignore everything before "."
splitted=s.split(".")	splitted=s.split(".")

if len(splitted)==1: #kein punkt	if len(splitted)==1: #kein punkt
Line 70 class graphemeSplitter:	Line 104 class graphemeSplitter:
lineNum=splitted[0] #store line number	lineNum=splitted[0] #store line number

analyse=txt	analyse=txt

analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems	analyse=re.sub(komma_exception,r"\1",analyse) # delete kommata except kommata relevant in graphems
	analyse=re.sub(self.delete,' ',analyse) # deletions

analyse=re.sub(delete,' ',analyse) # deletions	if self.indexName=="luceneSplitter":
	if pNum:
	analyser=PyLucene.StandardAnalyzer()
	logging.error("calling lucene")

	IndexLine("/tmp/index",analyser,pNum,lineNum,analyse)
	else:
splitted = analyse.split(" ")	splitted = analyse.split(" ")


for w in splitted:	for w in splitted:
w=w.lstrip().rstrip()	w=w.lstrip().rstrip()

if not (w==''):	if not (w==''):
if pNum: #only when pNum is found (first call of the splitter; it is always called twice in the pipeline)	if pNum: #only when pNum is found (first call of the splitter; it is always called twice in the pipeline)
Zope.app().cdliRoot.storeInLineIndex(w.lstrip().strip(),(pNum,lineNum))
	Zope2.app().cdliRoot.storeInLineIndex(self.indexName,w.lstrip().strip(),(pNum,lineNum))
transaction.get().commit()	transaction.get().commit()

result.append(w.lstrip().rstrip())	result.append(w.lstrip().rstrip())
return result	return result


	class graphemeSplitter(cdliSplitter):
	delete=deleteGraphems
	indexName="graphemeSplitter"

	class wordSplitter(cdliSplitter):
	delete=deleteWords
	indexName="wordSplitter"

	class luceneSplitter(cdliSplitter):
	delete=deleteWords
	indexName="luceneSplitter"


try:	try:
element_factory.registerFactory('Word Splitter',	element_factory.registerFactory('Word Splitter',
'CDLI grapheme splitter', graphemeSplitter)	'CDLI grapheme splitter', graphemeSplitter)
Line 96 except:	Line 150 except:
# in case the splitter is already registered, ValueError is raised	# in case the splitter is already registered, ValueError is raised
pass	pass

	try:
	element_factory.registerFactory('Word Splitter',
	'CDLI word splitter', wordSplitter)
	except:
	# in case the splitter is already registered, ValueError is raised
	pass

	try:
	element_factory.registerFactory('Word Splitter',
	'CDLI lucene splitter', luceneSplitter)
	except:
	# in case the splitter is already registered, ValueError is raised
	pass
if __name__ == '__main__':	if __name__ == '__main__':
a = 'abc defÎÒÃÇµÄºÜ ºÃ¡£'	a = 'abc defÎÒÃÇµÄºÜ ºÃ¡£'
u = unicode(a, 'gbk')	u = unicode(a, 'gbk')

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.3
changed lines
	Added in v.1.7