annotate src/de/mpiwg/anteater/ml/preprocessing/DataCreator.java @ 0:036535fcd179

anteater
author jdamerow
date Fri, 14 Sep 2012 10:30:43 +0200
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
036535fcd179 anteater
jdamerow
parents:
diff changeset
1 package de.mpiwg.anteater.ml.preprocessing;
036535fcd179 anteater
jdamerow
parents:
diff changeset
2
036535fcd179 anteater
jdamerow
parents:
diff changeset
3 import java.io.File;
036535fcd179 anteater
jdamerow
parents:
diff changeset
4 import java.io.FileWriter;
036535fcd179 anteater
jdamerow
parents:
diff changeset
5 import java.io.IOException;
036535fcd179 anteater
jdamerow
parents:
diff changeset
6 import java.io.InputStream;
036535fcd179 anteater
jdamerow
parents:
diff changeset
7 import java.io.StringWriter;
036535fcd179 anteater
jdamerow
parents:
diff changeset
8
036535fcd179 anteater
jdamerow
parents:
diff changeset
9 import org.apache.commons.io.IOUtils;
036535fcd179 anteater
jdamerow
parents:
diff changeset
10
036535fcd179 anteater
jdamerow
parents:
diff changeset
11 import de.mpiwg.anteater.AnteaterConfiguration;
036535fcd179 anteater
jdamerow
parents:
diff changeset
12 import de.mpiwg.anteater.ml.ITextParser;
036535fcd179 anteater
jdamerow
parents:
diff changeset
13 import de.mpiwg.anteater.text.TextInformation;
036535fcd179 anteater
jdamerow
parents:
diff changeset
14
036535fcd179 anteater
jdamerow
parents:
diff changeset
15 public abstract class DataCreator {
036535fcd179 anteater
jdamerow
parents:
diff changeset
16 public final static String COMPONENT_NAME = DataCreator.class.getSimpleName();
036535fcd179 anteater
jdamerow
parents:
diff changeset
17
036535fcd179 anteater
jdamerow
parents:
diff changeset
18 public final static String UNKNOWN_CLASS_SYMBOL = "?";
036535fcd179 anteater
jdamerow
parents:
diff changeset
19
036535fcd179 anteater
jdamerow
parents:
diff changeset
20 protected AnteaterConfiguration configuration;
036535fcd179 anteater
jdamerow
parents:
diff changeset
21 private String filenamePrefix;
036535fcd179 anteater
jdamerow
parents:
diff changeset
22
036535fcd179 anteater
jdamerow
parents:
diff changeset
23 public DataCreator(AnteaterConfiguration configuration, String filenamePrefix) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
24 this.configuration = configuration;
036535fcd179 anteater
jdamerow
parents:
diff changeset
25 this.filenamePrefix = filenamePrefix;
036535fcd179 anteater
jdamerow
parents:
diff changeset
26 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
27
036535fcd179 anteater
jdamerow
parents:
diff changeset
28 public abstract void createFileContents(TextInformation info, StringBuffer arffContents, ITextParser textParser);
036535fcd179 anteater
jdamerow
parents:
diff changeset
29
036535fcd179 anteater
jdamerow
parents:
diff changeset
30
036535fcd179 anteater
jdamerow
parents:
diff changeset
31 public String createARFFFile(TextInformation info, ITextParser textParser) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
32 configuration.getLogger().logMessage(COMPONENT_NAME, "Creating location ARFF-file for " + info.getFilepath());
036535fcd179 anteater
jdamerow
parents:
diff changeset
33
036535fcd179 anteater
jdamerow
parents:
diff changeset
34
036535fcd179 anteater
jdamerow
parents:
diff changeset
35 File file = new File(info.getFilepath());
036535fcd179 anteater
jdamerow
parents:
diff changeset
36 String filename = file.getName();
036535fcd179 anteater
jdamerow
parents:
diff changeset
37 String fname = filenamePrefix + filename.substring(0, filename.lastIndexOf("."));
036535fcd179 anteater
jdamerow
parents:
diff changeset
38
036535fcd179 anteater
jdamerow
parents:
diff changeset
39 File analysisFile = new File(configuration.getMlPath() + File.separator + fname + ".arff");
036535fcd179 anteater
jdamerow
parents:
diff changeset
40 if (!analysisFile.exists()) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
41 try {
036535fcd179 anteater
jdamerow
parents:
diff changeset
42 analysisFile.createNewFile();
036535fcd179 anteater
jdamerow
parents:
diff changeset
43 } catch (IOException e) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
44 e.printStackTrace();
036535fcd179 anteater
jdamerow
parents:
diff changeset
45 return null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
46 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
47 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
48 else
036535fcd179 anteater
jdamerow
parents:
diff changeset
49 return analysisFile.getAbsolutePath();
036535fcd179 anteater
jdamerow
parents:
diff changeset
50
036535fcd179 anteater
jdamerow
parents:
diff changeset
51 StringWriter writer = new StringWriter();
036535fcd179 anteater
jdamerow
parents:
diff changeset
52 InputStream stream = getClass().getResourceAsStream("template.arff");
036535fcd179 anteater
jdamerow
parents:
diff changeset
53 try {
036535fcd179 anteater
jdamerow
parents:
diff changeset
54 IOUtils.copy(stream, writer);
036535fcd179 anteater
jdamerow
parents:
diff changeset
55 } catch (IOException e) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
56 e.printStackTrace();
036535fcd179 anteater
jdamerow
parents:
diff changeset
57 return null;
036535fcd179 anteater
jdamerow
parents:
diff changeset
58 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
59
036535fcd179 anteater
jdamerow
parents:
diff changeset
60 StringBuffer arffContents = writer.getBuffer();
036535fcd179 anteater
jdamerow
parents:
diff changeset
61
036535fcd179 anteater
jdamerow
parents:
diff changeset
62
036535fcd179 anteater
jdamerow
parents:
diff changeset
63 createFileContents(info, arffContents, textParser);
036535fcd179 anteater
jdamerow
parents:
diff changeset
64
036535fcd179 anteater
jdamerow
parents:
diff changeset
65
036535fcd179 anteater
jdamerow
parents:
diff changeset
66 try {
036535fcd179 anteater
jdamerow
parents:
diff changeset
67 FileWriter filewriter = new FileWriter(analysisFile);
036535fcd179 anteater
jdamerow
parents:
diff changeset
68 filewriter.write(arffContents.toString());
036535fcd179 anteater
jdamerow
parents:
diff changeset
69 filewriter.flush();
036535fcd179 anteater
jdamerow
parents:
diff changeset
70 filewriter.close();
036535fcd179 anteater
jdamerow
parents:
diff changeset
71 } catch (IOException e) {
036535fcd179 anteater
jdamerow
parents:
diff changeset
72 // TODO Auto-generated catch block
036535fcd179 anteater
jdamerow
parents:
diff changeset
73 e.printStackTrace();
036535fcd179 anteater
jdamerow
parents:
diff changeset
74 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
75
036535fcd179 anteater
jdamerow
parents:
diff changeset
76 return analysisFile.getAbsolutePath();
036535fcd179 anteater
jdamerow
parents:
diff changeset
77 }
036535fcd179 anteater
jdamerow
parents:
diff changeset
78 }