|
|
| version 1.7.2.6, 2007/12/03 21:30:19 | version 1.7.2.12, 2008/01/14 17:35:26 |
|---|---|
| Line 25 ignoreLines=['$','@','#','&','>'] | Line 25 ignoreLines=['$','@','#','&','>'] |
| separators=[''] | separators=[''] |
| # kommas relevant for graphemes will not be deleted | # kommas relevant for graphemes will not be deleted |
| komma_exception="([^sStThH])," | komma_exception="([^sStThH])," |
| | komma_exceptionex=re.compile(komma_exception) |
| # grapheme boundaries | # grapheme boundaries |
| #graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" | #graphemeBounds="\{|\}|<|>|\(|\)|-|_|\#|,|\||\]|\[|\!|\?" |
| graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" | graphemeBounds="\{|\}|<|>|-|_|\#|,|\]|\[|\!|\?|\"" |
| | graphemeIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" |
| # for words | # for words |
| #wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" | #wordBounds="<|>|\(|\)|_|\#|,|\||\]|\[|\!|\?" |
| wordBounds="<|>|_|\#|,|\]|\[|\!|\?|\"" | wordBounds="_|,|\"" |
| | wordIgnore="<|>|\#|\||\]|\[|\!|\?\*|;" |
| class cdliSplitter: | class cdliSplitter: |
| """base class for splitter. | """base class for splitter. |
| Line 40 class cdliSplitter: | Line 42 class cdliSplitter: |
| default_encoding = "utf-8" | default_encoding = "utf-8" |
| bounds=graphemeBounds | bounds=graphemeBounds |
| | boundsex=re.compile(graphemeBounds) |
| | ignore=graphemeIgnore |
| | ignorex=re.compile(graphemeIgnore) |
| indexName="cdliSplitter" | indexName="cdliSplitter" |
| Line 68 class cdliSplitter: | Line 73 class cdliSplitter: |
| elif not (s[0] in ignoreLines): | elif not (s[0] in ignoreLines): |
| # regular line | # regular line |
| lineparts=s.split(".") | lineparts=s.split(". ",1) |
| if len(lineparts)==1: | if len(lineparts)==1: |
| # no line number | # no line number |
| txt=s | txt=s |
| Line 78 class cdliSplitter: | Line 83 class cdliSplitter: |
| lineNum=lineparts[0] | lineNum=lineparts[0] |
| # delete kommata except kommata relevant for graphemes | # delete kommata except kommata relevant for graphemes |
| txt = re.sub(komma_exception,r"\1",txt) | txt = komma_exceptionex.sub(r"\1",txt) |
| # replace word boundaries by spaces | # replace word boundaries by spaces |
| txt = re.sub(self.bounds,' ',txt) | txt = self.boundsex.sub(' ',txt) |
| | # replace letters to be ignored |
| | txt = self.ignorex.sub('',txt) |
| # split words | # split words |
| words = txt.split(" ") | words = txt.split(" ") |
| for w in words: | for w in words: |
| Line 94 class cdliSplitter: | Line 101 class cdliSplitter: |
| class graphemeSplitter(cdliSplitter): | class graphemeSplitter(cdliSplitter): |
| bounds=graphemeBounds | bounds=graphemeBounds |
| | boundsex=re.compile(graphemeBounds) |
| | ignore=graphemeIgnore |
| | ignorex=re.compile(graphemeIgnore) |
| indexName="graphemeSplitter" | indexName="graphemeSplitter" |
| class wordSplitter(cdliSplitter): | class wordSplitter(cdliSplitter): |
| bounds=wordBounds | bounds=wordBounds |
| | boundsex=re.compile(wordBounds) |
| | ignore=wordIgnore |
| | ignorex=re.compile(wordIgnore) |
| indexName="wordSplitter" | indexName="wordSplitter" |
| try: | try: |