view src/main/java/edu/harvard/iq/dataverse/datavariable/DataTableImportDDI.java @ 10:a50cf11e5178

Rewrite LGDataverse completely upgrading to dataverse4.0
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Tue, 08 Sep 2015 17:00:21 +0200
parents
children
line wrap: on
line source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package edu.harvard.iq.dataverse.datavariable;

import edu.harvard.iq.dataverse.DataTable;
import edu.harvard.iq.dataverse.util.StringUtil;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

/**
 * Imports the variable-level metadata found in the {@code <dataDscr>} section
 * of a DDI document, reading it from a StAX {@link XMLStreamReader}.
 *
 * <p>Every {@code <var>} element is turned into a {@link DataVariable}.
 * Variables are grouped into {@link DataTable}s, which are keyed by the
 * {@code fileid} attribute of the variable's {@code <location>} element.
 *
 * @author Leonid Andreev
 */
public class DataTableImportDDI {

    // Values of the "intrvl" attribute of <var>.
    public static final String VAR_INTERVAL_DISCRETE = "discrete";
    public static final String VAR_INTERVAL_CONTIN = "contin";
    public static final String VAR_INTERVAL_NOMINAL = "nominal";
    public static final String VAR_INTERVAL_DICHOTOMOUS = "dichotomous";

    // Values of the "type" attribute of <varFormat>.
    public static final String VAR_TYPE_NUMERIC = "numeric";
    public static final String VAR_TYPE_CHARACTER = "character";

    // Value of the "wgt" attribute of <var> that marks a weighted variable.
    public static final String VAR_WEIGHTED = "wgtd";

    // Values of the "level" attribute of <labl>.
    public static final String LEVEL_VARIABLE = "variable";
    public static final String LEVEL_CATEGORY = "category";

    // Value of the "type" attribute of <catStat> that denotes a frequency.
    public static final String CAT_STAT_TYPE_FREQUENCY = "freq";

    // Value of the "type" attribute of the <notes> element that carries the UNF.
    public static final String NOTE_TYPE_UNF = "VDC:UNF";

    /**
     * Processes a {@code <dataDscr>} section. The reader is expected to have
     * just encountered the {@code <dataDscr>} start tag.
     *
     * <p>The DataTables in the returned map still need to be linked to the
     * corresponding DataFiles by the file ids used as keys. The DataVariable
     * objects found in the section have already been linked to their
     * DataTables. -- L.A. 4.0 beta 9
     *
     * @param xmlr the stream reader to consume
     * @return the DataTables keyed by file id, or {@code null} if the end of
     *         the document is reached before {@code </dataDscr>}
     * @throws XMLStreamException if the underlying reader fails or a variable
     *         section is malformed
     */
    private Map<String, DataTable> processDataDscr(XMLStreamReader xmlr) throws XMLStreamException {
        Map<String, DataTable> dataTablesMap = new HashMap<>();
        Map<String, Integer> varsPerFileMap = new HashMap<>();

        for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
            if (event == XMLStreamConstants.START_ELEMENT) {
                if ("var".equals(xmlr.getLocalName())) {
                    processVar(xmlr, dataTablesMap, varsPerFileMap);
                }
            } else if (event == XMLStreamConstants.END_ELEMENT
                    && "dataDscr".equals(xmlr.getLocalName())) {
                // TODO:
                //  a data file entry for which no variables were found in
                //  the <dataDscr> section needs to be converted from
                //  TabularFile to OtherFile; i.e., it should be treated as
                //  non-subsettable. This actually happens in real life. For
                //  example, Roper puts some of their files into the
                //  <fileDscr> section, even though there's no <dataDscr>
                //  provided for them.
                //      -- L.A.
                // TODO: confirm that this works under 4.0 as is
                // -- L.A. 4.0 beta 9
                return dataTablesMap;
            }
        }
        return null;
    }

    /**
     * Processes one {@code <var>} element and everything nested in it.
     *
     * @param xmlr           the reader, positioned on the {@code <var>} start tag
     * @param dataTablesMap  DataTables found so far, keyed by file id; updated in place
     * @param varsPerFileMap number of variables found so far per file id; updated in place
     * @throws XMLStreamException if the reader fails or a nested element is malformed
     */
    private void processVar(XMLStreamReader xmlr, Map<String, DataTable> dataTablesMap, Map<String, Integer> varsPerFileMap) throws XMLStreamException {
        DataVariable dv = new DataVariable();
        dv.setInvalidRanges(new ArrayList<VariableRange>());
        dv.setSummaryStatistics(new ArrayList<SummaryStatistic>());
        dv.setCategories(new ArrayList<VariableCategory>());
        dv.setName(xmlr.getAttributeValue(null, "name"));

        // A missing or non-numeric "dcml" simply leaves the value unset.
        Long decimalPoints = parseLongOrNull(xmlr.getAttributeValue(null, "dcml"));
        if (decimalPoints != null) {
            dv.setNumberOfDecimalPoints(decimalPoints);
        }

        // interval type (DB value may be different than DDI value)
        String interval = xmlr.getAttributeValue(null, "intrvl");
        if (VAR_INTERVAL_CONTIN.equals(interval)) {
            dv.setIntervalContinuous();
        } else if (VAR_INTERVAL_NOMINAL.equals(interval)) {
            dv.setIntervalNominal();
        } else if (VAR_INTERVAL_DICHOTOMOUS.equals(interval)) {
            dv.setIntervalDichotomous();
        } else {
            // default is discrete
            dv.setIntervalDiscrete();
        }

        // default is not-wgtd, so a missing attribute sets weighted to false
        dv.setWeighted(VAR_WEIGHTED.equals(xmlr.getAttributeValue(null, "wgt")));

        for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
            if (event == XMLStreamConstants.START_ELEMENT) {
                String element = xmlr.getLocalName();
                if ("location".equals(element)) {
                    processLocation(xmlr, dv, dataTablesMap, varsPerFileMap);
                } else if ("labl".equals(element)) {
                    String label = processLabl(xmlr, LEVEL_VARIABLE);
                    if (label != null && !label.isEmpty()) {
                        dv.setLabel(label);
                    }
                } else if ("universe".equals(element)) {
                    dv.setUniverse(parseText(xmlr));
                } else if ("invalrng".equals(element)) {
                    processInvalrng(xmlr, dv);
                } else if ("varFormat".equals(element)) {
                    processVarFormat(xmlr, dv);
                } else if ("sumStat".equals(element)) {
                    processSumStat(xmlr, dv);
                } else if ("catgry".equals(element)) {
                    processCatgry(xmlr, dv);
                } else if ("notes".equals(element)) {
                    String note = parseNoteByType(xmlr, NOTE_TYPE_UNF);
                    if (note != null && !note.isEmpty()) {
                        dv.setUnf(parseUNF(note));
                    }
                }
            } else if (event == XMLStreamConstants.END_ELEMENT
                    && "var".equals(xmlr.getLocalName())) {
                return;
            }
        }
    }

    /**
     * Processes a {@code <location>} element: records the variable's position
     * attributes and links the variable to the DataTable registered under the
     * element's {@code fileid}, creating that DataTable on first use.
     *
     * @param xmlr           the reader, positioned on the {@code <location>} start tag
     * @param dv             the variable being built
     * @param dataTablesMap  DataTables found so far, keyed by file id; updated in place
     * @param varsPerFileMap number of variables found so far per file id; updated in place
     * @throws XMLStreamException if the {@code fileid} attribute is missing or
     *         empty, or if the variable is already linked to a DataTable
     */
    private void processLocation(XMLStreamReader xmlr, DataVariable dv, Map<String, DataTable> dataTablesMap, Map<String, Integer> varsPerFileMap) throws XMLStreamException {

        // StartPos, EndPos, and RecSegNo:
        // if these fields don't convert to Long, just leave blank
        Long startPosition = parseLongOrNull(xmlr.getAttributeValue(null, "StartPos"));
        if (startPosition != null) {
            dv.setFileStartPosition(startPosition);
        }
        Long endPosition = parseLongOrNull(xmlr.getAttributeValue(null, "EndPos"));
        if (endPosition != null) {
            dv.setFileEndPosition(endPosition);
        }
        Long recordSegment = parseLongOrNull(xmlr.getAttributeValue(null, "RecSegNo"));
        if (recordSegment != null) {
            dv.setRecordSegmentNumber(recordSegment);
        }

        if (dv.getDataTable() != null) {
            // The previous implementation also rejected this case, but
            // reported it with the "Empty or NULL location" message.
            throw new XMLStreamException("Variable is already linked to a data table; unexpected additional location element.");
        }

        String fileId = xmlr.getAttributeValue(null, "fileid");
        if (fileId == null || fileId.isEmpty()) {
            // Without a file id the variable cannot be attached to any
            // DataTable and would be silently lost.
            throw new XMLStreamException("Empty or NULL location attribute in a variable section.");
        }

        DataTable datatable = dataTablesMap.get(fileId);
        if (datatable == null) {
            datatable = new DataTable();
            dataTablesMap.put(fileId, datatable);
            varsPerFileMap.put(fileId, Integer.valueOf(0));
        }

        dv.setDataTable(datatable);
        if (datatable.getDataVariables() == null) {
            datatable.setDataVariables(new ArrayList<DataVariable>());
        }
        datatable.getDataVariables().add(dv);

        // The running per-file count doubles as the variable's 0-based
        // order within its file.
        int fileOrder = varsPerFileMap.get(fileId).intValue();
        dv.setFileOrder(fileOrder);
        varsPerFileMap.put(fileId, Integer.valueOf(fileOrder + 1));
    }

    /**
     * Processes an {@code <invalrng>} element, adding one VariableRange to the
     * variable's invalid ranges for every nested {@code <item>} or
     * {@code <range>}.
     *
     * @param xmlr the reader, positioned on the {@code <invalrng>} start tag
     * @param dv   the variable being built
     * @throws XMLStreamException if the reader fails
     */
    private void processInvalrng(XMLStreamReader xmlr, DataVariable dv) throws XMLStreamException {
        for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
            if (event == XMLStreamConstants.START_ELEMENT) {
                String element = xmlr.getLocalName();
                if ("item".equals(element)) {
                    VariableRange range = new VariableRange();
                    dv.getInvalidRanges().add(range);
                    range.setDataVariable(dv);

                    range.setBeginValue(xmlr.getAttributeValue(null, "VALUE"));
                    range.setBeginValueTypePoint();
                } else if ("range".equals(element)) {
                    VariableRange range = new VariableRange();
                    dv.getInvalidRanges().add(range);
                    range.setDataVariable(dv);

                    String min = xmlr.getAttributeValue(null, "min");
                    String minExclusive = xmlr.getAttributeValue(null, "minExclusive");
                    String max = xmlr.getAttributeValue(null, "max");
                    String maxExclusive = xmlr.getAttributeValue(null, "maxExclusive");

                    // An inclusive bound takes precedence over an exclusive one.
                    if (!StringUtil.isEmpty(min)) {
                        range.setBeginValue(min);
                        range.setBeginValueTypeMin();
                    } else if (!StringUtil.isEmpty(minExclusive)) {
                        range.setBeginValue(minExclusive);
                        range.setBeginValueTypeMinExcl();
                    }

                    if (!StringUtil.isEmpty(max)) {
                        range.setEndValue(max);
                        range.setEndValueTypeMax();
                    } else if (!StringUtil.isEmpty(maxExclusive)) {
                        range.setEndValue(maxExclusive);
                        range.setEndValueTypeMaxExcl();
                    }
                }
            } else if (event == XMLStreamConstants.END_ELEMENT
                    && "invalrng".equals(xmlr.getLocalName())) {
                return;
            }
        }
    }

    /**
     * Processes a {@code <varFormat>} element: variable type, format name and
     * format category.
     *
     * @param xmlr the reader, positioned on the {@code <varFormat>} start tag
     * @param dv   the variable being built
     * @throws XMLStreamException if the reader fails or the element contains
     *         nested elements
     */
    private void processVarFormat(XMLStreamReader xmlr, DataVariable dv) throws XMLStreamException {
        // default is numeric: anything other than "character" (including a
        // missing attribute) is treated as numeric
        if (VAR_TYPE_CHARACTER.equals(xmlr.getAttributeValue(null, "type"))) {
            dv.setTypeCharacter();
        } else {
            dv.setTypeNumeric();
        }

        dv.setFormat(xmlr.getAttributeValue(null, "formatname"));

        // The attribute has to be read before parseText() moves the reader
        // off the start tag.
        String categoryAttribute = xmlr.getAttributeValue(null, "category");
        String formatText = parseText(xmlr);

        /*
         * A somewhat hackish way of recognizing "boolean" variables;
         * This is not a universally accepted convention - we (the DVN team)
         * simply decided to handle it this way. Booleans are treated simply
         * as categorical variables with integers 0 and 1 for the values, and
         * "FALSE" and "TRUE" for the labels. On top of that, we make a note
         * of the variable's "booleanness", in the DDI, like this:
         *      <varFormat ...>Boolean</varFormat>
         * and in the database, by setting the value of dv.formatCategory to
         * "Boolean".
         * This information isn't used much in the application (as of May, 2013),
         * except in the subsetting: when the column is subset and re-imported
         * into an R data frame, we'll convert it into a logical vector.
         * TODO:
         * Add this to the export end! --L.A.
         */
        if ("Boolean".equalsIgnoreCase(formatText)) {
            dv.setFormatCategory("Boolean");
        } else {
            dv.setFormatCategory(categoryAttribute);
        }
    }

    /**
     * Processes a {@code <sumStat>} element and appends the resulting
     * SummaryStatistic to the variable.
     *
     * @param xmlr the reader, positioned on the {@code <sumStat>} start tag
     * @param dv   the variable being built
     * @throws XMLStreamException if the reader fails or the element contains
     *         nested elements
     */
    private void processSumStat(XMLStreamReader xmlr, DataVariable dv) throws XMLStreamException {
        SummaryStatistic ss = new SummaryStatistic();
        // The attribute has to be read before parseText() advances the reader.
        ss.setTypeByLabel(xmlr.getAttributeValue(null, "type"));
        ss.setValue(parseText(xmlr));
        ss.setDataVariable(dv);
        dv.getSummaryStatistics().add(ss);
    }

    /**
     * Processes a {@code <catgry>} element and appends the resulting
     * VariableCategory to the variable.
     *
     * @param xmlr the reader, positioned on the {@code <catgry>} start tag
     * @param dv   the variable being built
     * @throws XMLStreamException if the reader fails or a nested text element
     *         contains nested elements
     */
    private void processCatgry(XMLStreamReader xmlr, DataVariable dv) throws XMLStreamException {
        VariableCategory cat = new VariableCategory();
        // default is N, so a missing attribute sets missing to false
        cat.setMissing("Y".equals(xmlr.getAttributeValue(null, "missing")));
        cat.setDataVariable(dv);

        if (dv.getCategories() == null || dv.getCategories().size() == 0) {
            // if this is the first category we encounter, we'll assume that this
            // categorical data/"factor" variable is ordered.
            // But we'll switch it back to unordered later, if we encounter
            // *any* categories with no order attribute defined.
            dv.setOrderedCategorical(true);
        }

        // Process extra level order values, if available;
        // Currently (as of 3.6) only available in R Data ingests.
        // TODO:
        // revisit this (for 4.0) - we've discussed encoding this order
        // simply by the order in which the categories appear in the
        // DDI. (-- L.A. 4.0 beta 9)
        Integer orderValue = null;
        String order = xmlr.getAttributeValue(null, "order");
        if (order != null) {
            try {
                orderValue = Integer.valueOf(order);
            } catch (NumberFormatException ignored) {
                // a non-numeric order is treated the same as a missing one
            }
        }

        if (orderValue != null && orderValue.intValue() >= 0) {
            cat.setOrder(orderValue.intValue());
        } else if (!cat.isMissing()) {
            // Every category of an ordered categorical ("factor") variable
            // must have the order rank defined. Which means that if we
            // encounter a single NON-MISSING category with no order attribute,
            // the variable will be processed as un-ordered.
            dv.setOrderedCategorical(false);
        }

        dv.getCategories().add(cat);

        for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
            if (event == XMLStreamConstants.START_ELEMENT) {
                String element = xmlr.getLocalName();
                if ("labl".equals(element)) {
                    String label = processLabl(xmlr, LEVEL_CATEGORY);
                    if (label != null && !label.isEmpty()) {
                        cat.setLabel(label);
                    }
                } else if ("catValu".equals(element)) {
                    // category values are kept verbatim (no trimming)
                    cat.setValue(parseText(xmlr, false));
                } else if ("catStat".equals(element)) {
                    String type = xmlr.getAttributeValue(null, "type");
                    if (type == null || CAT_STAT_TYPE_FREQUENCY.equalsIgnoreCase(type)) {
                        String frequency = parseText(xmlr);
                        if (frequency != null && !frequency.isEmpty()) {
                            try {
                                cat.setFrequency(Double.valueOf(frequency));
                            } catch (NumberFormatException ignored) {
                                // Like the other numeric fields in this
                                // class: a value that does not convert is
                                // left blank rather than aborting the import.
                            }
                        }
                    }
                }
            } else if (event == XMLStreamConstants.END_ELEMENT
                    && "catgry".equals(xmlr.getLocalName())) {
                return;
            }
        }
    }

    /**
     * Returns the text of the current {@code <labl>} element if its
     * {@code level} attribute matches (case-insensitively) the requested
     * level; otherwise returns {@code null} without consuming the element.
     */
    private String processLabl(XMLStreamReader xmlr, String level) throws XMLStreamException {
        if (level.equalsIgnoreCase(xmlr.getAttributeValue(null, "level"))) {
            return parseText(xmlr);
        }
        return null;
    }

    /**
     * Returns the text of the current element if its {@code type} attribute
     * matches (case-insensitively) the requested type; otherwise returns
     * {@code null} without consuming the element.
     */
    private String parseNoteByType(XMLStreamReader xmlr, String type) throws XMLStreamException {
        if (type.equalsIgnoreCase(xmlr.getAttributeValue(null, "type"))) {
            return parseText(xmlr);
        }
        return null;
    }

    /**
     * Returns the part of the given string that starts at the first
     * occurrence of {@code "UNF:"}, or {@code null} if there is none.
     */
    private String parseUNF(String unfString) {
        int start = unfString.indexOf("UNF:");
        return start != -1 ? unfString.substring(start) : null;
    }

    /**
     * Reads the text content of the current element, trimmed and with
     * newlines replaced by spaces.
     */
    private String parseText(XMLStreamReader xmlr) throws XMLStreamException {
        return parseText(xmlr, true);
    }

    /**
     * Reads the text content of the current element.
     *
     * @param scrubText if {@code true}, the text is trimmed and every newline
     *                  is replaced by a space; if {@code false} it is
     *                  returned verbatim
     */
    private String parseText(XMLStreamReader xmlr, boolean scrubText) throws XMLStreamException {
        String text = getElementText(xmlr);
        if (scrubText) {
            text = text.trim().replace('\n', ' ');
        }
        return text;
    }

    /**
     * Converts the given string to a Long.
     *
     * @return the parsed value, or {@code null} if the string is
     *         {@code null} or is not a valid long
     */
    private static Long parseLongOrNull(String value) {
        if (value == null) {
            return null;
        }
        try {
            return Long.valueOf(value);
        } catch (NumberFormatException ignored) {
            return null;
        }
    }

    /**
     * Replacement for {@link XMLStreamReader#getElementText()} that skips
     * ENTITY_REFERENCE events instead of appending their text.
     *
     * <p>Per the original author's note: the reference getElementText would
     * append a null before the text if there was an escaped apostrophe (it
     * appears to find a null ENTITY_REFERENCE in that case, which seems like
     * a bug); the workaround for the moment is to not handle
     * ENTITY_REFERENCE here.
     *
     * <p>TODO: do we still need this method? ( -- L.A. 4.0 beta 9)
     *
     * @param xmlr the reader, which must be positioned on a START_ELEMENT
     * @return the concatenated character content up to the matching end tag
     * @throws XMLStreamException if the reader is not on a START_ELEMENT, the
     *         content contains a nested element, or the document ends early
     */
    private String getElementText(XMLStreamReader xmlr) throws XMLStreamException {
        if (xmlr.getEventType() != XMLStreamConstants.START_ELEMENT) {
            throw new XMLStreamException("parser must be on START_ELEMENT to read next text", xmlr.getLocation());
        }

        StringBuilder content = new StringBuilder();
        for (int eventType = xmlr.next(); eventType != XMLStreamConstants.END_ELEMENT; eventType = xmlr.next()) {
            switch (eventType) {
                case XMLStreamConstants.CHARACTERS:
                case XMLStreamConstants.CDATA:
                case XMLStreamConstants.SPACE:
                    content.append(xmlr.getText());
                    break;
                case XMLStreamConstants.PROCESSING_INSTRUCTION:
                case XMLStreamConstants.COMMENT:
                case XMLStreamConstants.ENTITY_REFERENCE:
                    // skipping
                    break;
                case XMLStreamConstants.END_DOCUMENT:
                    throw new XMLStreamException("unexpected end of document when reading element text content");
                case XMLStreamConstants.START_ELEMENT:
                    throw new XMLStreamException("element text content may not contain START_ELEMENT", xmlr.getLocation());
                default:
                    throw new XMLStreamException("Unexpected event type "+eventType, xmlr.getLocation());
            }
        }
        return content.toString();
    }
}