/** * */ package oboannotator.application; import java.io.BufferedInputStream; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import oboannotator.typesys.MedlineDocument; import oboannotator.typesys.MedlineField; import oboannotator.util.AppConfig; import oboannotator.util.Util; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.cas.FSIndex; import org.apache.uima.conceptMapper.DictTerm; import org.apache.uima.examples.opennlp.Token; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import org.apache.uima.util.FileUtils; import org.apache.uima.util.XMLInputSource; /** * @author ahmedabdeenhamed * */ public class OBO_DocumentAnalyzer { public static void annotateDocuments(String userCusmtomedAE, String readFromDirectory) { try { File taeDescriptor = null; File inputDir = null; // Read and validate command line arguments boolean validArgs = false; taeDescriptor = new File(userCusmtomedAE); inputDir = new File(readFromDirectory); validArgs = taeDescriptor.exists() && !taeDescriptor.isDirectory() && inputDir.isDirectory(); if (!validArgs) { printUsageMessage(); } else { // get Resource Specifier from XML file XMLInputSource in = new XMLInputSource(taeDescriptor); ResourceSpecifier specifier = UIMAFramework.getXMLParser() .parseResourceSpecifier(in); // for debugging, output the Resource Specifier // System.out.println(specifier); // create Analysis Engine AnalysisEngine ae = UIMAFramework .produceAnalysisEngine(specifier); // create a CAS CAS cas = ae.newCAS(); // get all files in the input directory File[] files = inputDir.listFiles(); if (files == null) { System.out.println("No files to process"); } else { // process documents for (int i = 0; i < files.length; i++) { if (!files[i].isDirectory()) { processFile(files[i], ae, cas); } } } ae.destroy(); } } catch (Exception e) { e.printStackTrace(); } } public static void processFile(File aFile, AnalysisEngine anAE, CAS aCAS) throws IOException, AnalysisEngineProcessException { BufferedInputStream fis = null; StringBuilder builder = new StringBuilder(); ArrayList listOfNouns = new ArrayList(); try { String document = FileUtils.file2String(aFile); document = document.trim(); aCAS.getTypeSystem(); // put document text in CAS aCAS.setDocumentText(document); // process anAE.process(aCAS); JCas aJCas = aCAS.getJCas(); // get annotation indexes FSIndex mlDocIndex = aJCas.getAnnotationIndex(MedlineDocument.type); StringBuilder documentBuilder = new StringBuilder(); // iterate over all combinations Iterator mlDocIter = mlDocIndex.iterator(); while (mlDocIter.hasNext()) { MedlineDocument mlDoc = (MedlineDocument) mlDocIter.next(); //System.out.println("----------------------------------------"); //System.out.println("Doc Begins ..."); String mlDocText = mlDoc.getText(); //------------------------------------------------------------------------------------------------------------------- // create a CAS CAS mlDocCas = anAE.newCAS(); // put document text in CAS mlDocCas.getTypeSystem(); mlDocCas.setDocumentText(mlDocText + "\n\nEND -" ); //System.out.println("Doc Text: \n" + mlDocCas.getDocumentText() + "\n"); // process anAE.process(mlDocCas); JCas mlDocJCas = mlDocCas.getJCas(); FSIndex mlFieldIndex = mlDocJCas.getAnnotationIndex(MedlineField.type); //System.out.println(" ML Field Index : " + mlFieldIndex.size()); Iterator mlFieldIter = mlFieldIndex.iterator(); String documentID = null; while (mlFieldIter.hasNext()) { //count++; MedlineField mlField = (MedlineField) mlFieldIter.next(); if("UI".equals(mlField.getName())){ documentID = mlField.getText().trim(); } //MedlineField mlABField = (MedlineField) mlFieldIter.next(); if("AB".equals(mlField.getName())){ //------------------------------------------------------------------------------------------------------------------- // get the pos tagger tokens and be done with it. CAS posTaggerCas= anAE.newCAS(); posTaggerCas.getTypeSystem(); posTaggerCas.setDocumentText(mlField.getText()); // process anAE.process(posTaggerCas); JCas posTaggerJCas = posTaggerCas.getJCas(); FSIndex tokenIndex = posTaggerJCas.getAnnotationIndex(Token.type); Iterator posTokenIter = tokenIndex.iterator(); //http://flake.cs.uiuc.edu/~rizzolo/LBJ2/library/LBJ2/nlp/POS.html while(posTokenIter.hasNext()){ Token aToken = (Token)posTokenIter.next(); if (aToken.getPosTag().equals("NN") || aToken.getPosTag().equals("NNP") || aToken.getPosTag().equals("NNPS")||aToken.getPosTag().equals("NNS")){ listOfNouns.add(aToken.getCoveredText()); } } // create a CAS CAS mlFieldCas = anAE.newCAS(); // put document text in CAS mlFieldCas.getTypeSystem(); // put document text in CAS mlFieldCas.setDocumentText(mlField.getText()); // process anAE.process(mlFieldCas); JCas mlJCas = mlFieldCas.getJCas(); FSIndex mlDictTermIndex = mlJCas.getAnnotationIndex(DictTerm.type); Iterator mlDictTermIter = mlDictTermIndex.iterator(); while (mlDictTermIter.hasNext()) { DictTerm dictTerm = (DictTerm) mlDictTermIter.next(); if (listOfNouns.contains(dictTerm.getCoveredText())){ if(documentID!=null){ builder.append(documentID + "|"); } String[] splits = Util.splitOBOTerm(dictTerm.getDictCanon().trim()); builder.append(dictTerm.getCoveredText().trim() + "|" + splits[0] + "|" + splits[1] + "|" + splits[2] + "|"); builder.append(dictTerm.getBegin()); documentBuilder.append(builder.toString().trim() + "\n"); builder = new StringBuilder(); } } } } } Util.writeStringToFile(documentBuilder.toString(), AppConfig.OUTPUT_DIR, AppConfig.OUTPUT_FILE); //System.out.println(documentBuilder.toString()); // reset the CAS to prepare it for processing the next document aCAS.reset(); } catch (CASException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ResourceInitializationException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { try { if (fis != null) fis.close(); } catch (Exception ex) { ex.printStackTrace(); } } } private static void printUsageMessage() { System.err.println("Usage: oboannotator.applications " + " "); } /** * @param args */ public static void main(String[] args) { OBO_DocumentAnalyzer.annotateDocuments(AppConfig.AGGREGATE_AE_DIR+AppConfig.USER_ASSEMBLED_AGGREGATE_AE, AppConfig.INPUT_FILES_DIR); } }