Mercurial > hg > anteater
diff src/classifiers/english.conll.4class.distsim.prop @ 0:036535fcd179
anteater
author | jdamerow |
---|---|
date | Fri, 14 Sep 2012 10:30:43 +0200 |
parents | |
children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/classifiers/english.conll.4class.distsim.prop Fri Sep 14 10:30:43 2012 +0200
@@ -0,0 +1,58 @@
+# This is better than Jenny's either with or without distsim turned on
+# And using iob2 is better for optimal CoNLL performance.
+# Features titled "chris2009"
+
+trainFile = /u/nlp/data/ner/goodClassifiers/data/conll.jenny.train
+testFile = /u/nlp/data/ner/goodClassifiers/data/conll.jenny.testa
+serializeTo = english.conll.4class.distsim.crf.ser.gz
+
+useDistSim = true
+distSimLexicon = /u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters
+
+map = word=0,answer=1
+
+saveFeatureIndexToDisk = true
+
+useTitle = true
+useClassFeature=true
+useWord=true
+# useWordPairs=true
+useNGrams=true
+noMidNGrams=true
+# maxNGramLeng=6 # Having them all helps, which is the default
+usePrev=true
+useNext=true
+# useTags=true
+# useWordTag=true
+useLongSequences=true
+useSequences=true
+usePrevSequences=true
+maxLeft=1
+useTypeSeqs=true
+useTypeSeqs2=true
+useTypeySequences=true
+useOccurrencePatterns=true
+useLastRealWord=true
+useNextRealWord=true
+#useReverse=false
+normalize=true
+# normalizeTimex=true
+# dan2 better than chris2 on CoNLL data...
+wordShape=dan2useLC
+useDisjunctive=true
+# disjunctionWidth 4 is better than 5 on CoNLL data
+disjunctionWidth=4
+#useDisjunctiveShapeInteraction=true
+
+type=crf
+
+readerAndWriter=edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter
+
+useObservedSequencesOnly=true
+
+sigma = 20
+useQN = true
+QNsize = 25
+
+# makes it go faster
+featureDiffThresh=0.05