comparison src/main/java/edu/harvard/iq/dataverse/IndexServiceBean.java @ 10:a50cf11e5178

Rewrite LGDataverse completely, upgrading to Dataverse 4.0
author Zoe Hong <zhong@mpiwg-berlin.mpg.de>
date Tue, 08 Sep 2015 17:00:21 +0200
parents
children
comparison
equal deleted inserted replaced
9:5926d6419569 10:a50cf11e5178
1 package edu.harvard.iq.dataverse;
2
3 import edu.harvard.iq.dataverse.util.StringUtil;
4 import edu.harvard.iq.dataverse.authorization.providers.builtin.BuiltinUserServiceBean;
5 import edu.harvard.iq.dataverse.search.SearchFields;
6 import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean;
7 import edu.harvard.iq.dataverse.datavariable.DataVariable;
8 import edu.harvard.iq.dataverse.search.IndexResponse;
9 import edu.harvard.iq.dataverse.search.IndexableDataset;
10 import edu.harvard.iq.dataverse.search.IndexableObject;
11 import edu.harvard.iq.dataverse.search.SearchException;
12 import edu.harvard.iq.dataverse.search.SearchPermissionsServiceBean;
13 import edu.harvard.iq.dataverse.search.SolrIndexServiceBean;
14 import edu.harvard.iq.dataverse.util.FileUtil;
15 import edu.harvard.iq.dataverse.util.SystemConfig;
16 import java.io.IOException;
17 import java.sql.Timestamp;
18 import java.text.DateFormat;
19 import java.text.SimpleDateFormat;
20 import java.util.ArrayList;
21 import java.util.Calendar;
22 import java.util.Collection;
23 import java.util.Date;
24 import java.util.HashSet;
25 import java.util.LinkedHashMap;
26 import java.util.List;
27 import java.util.Locale;
28 import java.util.Map;
29 import java.util.Set;
30 import java.util.concurrent.Future;
31 import java.util.logging.Logger;
32 import javax.ejb.AsyncResult;
33 import javax.ejb.EJB;
34 import javax.ejb.EJBException;
35 import javax.ejb.Stateless;
36 import javax.ejb.TransactionAttribute;
37 import static javax.ejb.TransactionAttributeType.REQUIRES_NEW;
38 import javax.inject.Named;
39 import org.apache.solr.client.solrj.SolrQuery;
40 import org.apache.solr.client.solrj.SolrServer;
41 import org.apache.solr.client.solrj.SolrServerException;
42 import org.apache.solr.client.solrj.impl.HttpSolrServer;
43 import org.apache.solr.client.solrj.response.QueryResponse;
44 import org.apache.solr.client.solrj.response.UpdateResponse;
45 import org.apache.solr.common.SolrDocument;
46 import org.apache.solr.common.SolrDocumentList;
47 import org.apache.solr.common.SolrInputDocument;
48
49 @Stateless
50 @Named
51 public class IndexServiceBean {
52
// Class-wide logger for all index operations.
private static final Logger logger = Logger.getLogger(IndexServiceBean.class.getCanonicalName());

// Injected collaborators used while building and pushing Solr documents.
@EJB
DvObjectServiceBean dvObjectService;
@EJB
DataverseServiceBean dataverseService;
@EJB
DatasetServiceBean datasetService;
@EJB
BuiltinUserServiceBean dataverseUserServiceBean;
@EJB
PermissionServiceBean permissionService;
@EJB
AuthenticationServiceBean userServiceBean;
@EJB
SystemConfig systemConfig;
@EJB
SearchPermissionsServiceBean searchPermissionsService;
@EJB
SolrIndexServiceBean solrIndexService;
@EJB
DatasetLinkingServiceBean dsLinkingService;
@EJB
DataverseLinkingServiceBean dvLinkingService;

// Prefixes used to build Solr document ids, e.g. "dataverse_42", "dataset_17_draft".
public static final String solrDocIdentifierDataverse = "dataverse_";
public static final String solrDocIdentifierFile = "datafile_";
public static final String solrDocIdentifierDataset = "dataset_";
// Suffixes appended to dataset/file doc ids for non-published cards.
public static final String draftSuffix = "_draft";
public static final String deaccessionedSuffix = "_deaccessioned";
public static final String discoverabilityPermissionSuffix = "_permission";
// Group naming used in permission documents.
private static final String groupPrefix = "group_";
private static final String groupPerUserPrefix = "group_user";
private static final String publicGroupIdString = "public";
private static final String publicGroupString = groupPrefix + "public";
// Publication-status facet values stored in Solr.
private static final String PUBLISHED_STRING = "Published";
private static final String UNPUBLISHED_STRING = "Unpublished";
private static final String DRAFT_STRING = "Draft";
private static final String DEACCESSIONED_STRING = "Deaccessioned";
// Cache for the root dataverse lookup; presumably populated by findRootDataverseCached() — defined outside this view.
private Dataverse rootDataverseCached;
93
/**
 * Indexes a dataverse inside its own new transaction. REQUIRES_NEW keeps
 * the index write independent of whatever transaction the caller holds.
 * Delegates directly to indexDataverse().
 */
@TransactionAttribute(REQUIRES_NEW)
public Future<String> indexDataverseInNewTransaction(Dataverse dataverse) {
    return indexDataverse(dataverse);
}
98
99 public Future<String> indexDataverse(Dataverse dataverse) {
100 logger.fine("indexDataverse called on dataverse id " + dataverse.getId() + "(" + dataverse.getAlias() + ")");
101 if (dataverse.getId() == null) {
102 String msg = "unable to index dataverse. id was null (alias: " + dataverse.getAlias() + ")";
103 logger.info(msg);
104 return new AsyncResult<>(msg);
105 }
106 Dataverse rootDataverse = findRootDataverseCached();
107 if (rootDataverse == null) {
108 String msg = "Could not find root dataverse and the root dataverse should not be indexed. Returning.";
109 return new AsyncResult<>(msg);
110 } else {
111 if (dataverse.getId() == rootDataverse.getId()) {
112 String msg = "The root dataverse should not be indexed. Returning.";
113 return new AsyncResult<>(msg);
114 }
115 }
116 Collection<SolrInputDocument> docs = new ArrayList<>();
117 SolrInputDocument solrInputDocument = new SolrInputDocument();
118 solrInputDocument.addField(SearchFields.ID, solrDocIdentifierDataverse + dataverse.getId());
119 solrInputDocument.addField(SearchFields.ENTITY_ID, dataverse.getId());
120 solrInputDocument.addField(SearchFields.IDENTIFIER, dataverse.getAlias());
121 solrInputDocument.addField(SearchFields.TYPE, "dataverses");
122 solrInputDocument.addField(SearchFields.NAME, dataverse.getName());
123 solrInputDocument.addField(SearchFields.NAME_SORT, dataverse.getName());
124 solrInputDocument.addField(SearchFields.DATAVERSE_NAME, dataverse.getName());
125 solrInputDocument.addField(SearchFields.DATAVERSE_CATEGORY, dataverse.getIndexableCategoryName());
126 if (dataverse.isReleased()) {
127 solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING);
128 solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, dataverse.getPublicationDate());
129 solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(dataverse.getPublicationDate()));
130 } else {
131 solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING);
132 solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, dataverse.getCreateDate());
133 solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(dataverse.getCreateDate()));
134 }
135
136 addDataverseReleaseDateToSolrDoc(solrInputDocument, dataverse);
137 // if (dataverse.getOwner() != null) {
138 // solrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataverse.getOwner().getName());
139 // }
140 solrInputDocument.addField(SearchFields.DESCRIPTION, StringUtil.html2text(dataverse.getDescription()));
141 solrInputDocument.addField(SearchFields.DATAVERSE_DESCRIPTION, StringUtil.html2text(dataverse.getDescription()));
142 // logger.info("dataverse affiliation: " + dataverse.getAffiliation());
143 if (dataverse.getAffiliation() != null && !dataverse.getAffiliation().isEmpty()) {
144 /**
145 * @todo: stop using affiliation as category
146 */
147 // solrInputDocument.addField(SearchFields.CATEGORY, dataverse.getAffiliation());
148 solrInputDocument.addField(SearchFields.AFFILIATION, dataverse.getAffiliation());
149 solrInputDocument.addField(SearchFields.DATAVERSE_AFFILIATION, dataverse.getAffiliation());
150 }
151 for (ControlledVocabularyValue dataverseSubject : dataverse.getDataverseSubjects()) {
152 String subject = dataverseSubject.getStrValue();
153 if (!subject.equals(DatasetField.NA_VALUE)) {
154 solrInputDocument.addField(SearchFields.DATAVERSE_SUBJECT, subject);
155 // collapse into shared "subject" field used as a facet
156 solrInputDocument.addField(SearchFields.SUBJECT, subject);
157 }
158 }
159 // checking for NPE is important so we can create the root dataverse
160 if (rootDataverse != null && !dataverse.equals(rootDataverse)) {
161 // important when creating root dataverse
162 if (dataverse.getOwner() != null) {
163 solrInputDocument.addField(SearchFields.PARENT_ID, dataverse.getOwner().getId());
164 solrInputDocument.addField(SearchFields.PARENT_NAME, dataverse.getOwner().getName());
165 }
166 }
167 List<String> dataversePathSegmentsAccumulator = new ArrayList<>();
168 List<String> dataverseSegments = findPathSegments(dataverse, dataversePathSegmentsAccumulator);
169 List<String> dataversePaths = getDataversePathsFromSegments(dataverseSegments);
170 if (dataversePaths.size() > 0) {
171 // don't show yourself while indexing or in search results: https://redmine.hmdc.harvard.edu/issues/3613
172 // logger.info(dataverse.getName() + " size " + dataversePaths.size());
173 dataversePaths.remove(dataversePaths.size() - 1);
174 }
175 //Add paths for linking dataverses
176 for (Dataverse linkingDataverse : dvLinkingService.findLinkingDataverses(dataverse.getId())) {
177 List<String> linkingDataversePathSegmentsAccumulator = new ArrayList<>();
178 List<String> linkingdataverseSegments = findPathSegments(linkingDataverse, linkingDataversePathSegmentsAccumulator);
179 List<String> linkingDataversePaths = getDataversePathsFromSegments(linkingdataverseSegments);
180 for (String dvPath : linkingDataversePaths) {
181 dataversePaths.add(dvPath);
182 }
183 }
184 solrInputDocument.addField(SearchFields.SUBTREE, dataversePaths);
185 docs.add(solrInputDocument);
186
187 SolrServer server = new HttpSolrServer("http://" + systemConfig.getSolrHostColonPort() + "/solr");
188
189 String status;
190 try {
191 if (dataverse.getId() != null) {
192 server.add(docs);
193 } else {
194 logger.info("WARNING: indexing of a dataverse with no id attempted");
195 }
196 } catch (SolrServerException | IOException ex) {
197 status = ex.toString();
198 logger.info(status);
199 return new AsyncResult<>(status);
200 }
201 try {
202 server.commit();
203 } catch (SolrServerException | IOException ex) {
204 status = ex.toString();
205 logger.info(status);
206 return new AsyncResult<>(status);
207 }
208
209 dvObjectService.updateContentIndexTime(dataverse);
210 IndexResponse indexResponse = solrIndexService.indexPermissionsForOneDvObject(dataverse.getId());
211 String msg = "indexed dataverse " + dataverse.getId() + ":" + dataverse.getAlias() + ". Response from permission indexing: " + indexResponse.getMessage();
212 return new AsyncResult<>(msg);
213
214 }
215
/**
 * Indexes a dataset inside its own new transaction, skipping the normal
 * Solr document cleanup (doNormalSolrDocCleanUp = false). Delegates to
 * indexDataset().
 */
@TransactionAttribute(REQUIRES_NEW)
public Future<String> indexDatasetInNewTransaction(Dataset dataset) {
    boolean doNormalSolrDocCleanUp = false;
    return indexDataset(dataset, doNormalSolrDocCleanUp);
}
221
222 public Future<String> indexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) {
223 logger.fine("indexing dataset " + dataset.getId());
224 /**
225 * @todo should we use solrDocIdentifierDataset or
226 * IndexableObject.IndexableTypes.DATASET.getName() + "_" ?
227 */
228 // String solrIdPublished = solrDocIdentifierDataset + dataset.getId();
229 String solrIdPublished = determinePublishedDatasetSolrDocId(dataset);
230 String solrIdDraftDataset = IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.WORKING_COPY.getSuffix();
231 // String solrIdDeaccessioned = IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.DEACCESSIONED.getSuffix();
232 String solrIdDeaccessioned = determineDeaccessionedDatasetId(dataset);
233 StringBuilder debug = new StringBuilder();
234 debug.append("\ndebug:\n");
235 int numPublishedVersions = 0;
236 List<DatasetVersion> versions = dataset.getVersions();
237 List<String> solrIdsOfFilesToDelete = new ArrayList<>();
238 for (DatasetVersion datasetVersion : versions) {
239 Long versionDatabaseId = datasetVersion.getId();
240 String versionTitle = datasetVersion.getTitle();
241 String semanticVersion = datasetVersion.getSemanticVersion();
242 DatasetVersion.VersionState versionState = datasetVersion.getVersionState();
243 if (versionState.equals(DatasetVersion.VersionState.RELEASED)) {
244 numPublishedVersions += 1;
245 }
246 debug.append("version found with database id " + versionDatabaseId + "\n");
247 debug.append("- title: " + versionTitle + "\n");
248 debug.append("- semanticVersion-VersionState: " + semanticVersion + "-" + versionState + "\n");
249 List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
250 List<String> fileInfo = new ArrayList<>();
251 for (FileMetadata fileMetadata : fileMetadatas) {
252 String solrIdOfPublishedFile = solrDocIdentifierFile + fileMetadata.getDataFile().getId();
253 /**
254 * It sounds weird but the first thing we'll do is preemptively
255 * delete the Solr documents of all published files. Don't
256 * worry, published files will be re-indexed later along with
257 * the dataset. We do this so users can delete files from
258 * published versions of datasets and then re-publish a new
259 * version without fear that their old published files (now
260 * deleted from the latest published version) will be
261 * searchable. See also
262 * https://github.com/IQSS/dataverse/issues/762
263 */
264 solrIdsOfFilesToDelete.add(solrIdOfPublishedFile);
265 fileInfo.add(fileMetadata.getDataFile().getId() + ":" + fileMetadata.getLabel());
266 }
267 int numFiles = 0;
268 if (fileMetadatas != null) {
269 numFiles = fileMetadatas.size();
270 }
271 debug.append("- files: " + numFiles + " " + fileInfo.toString() + "\n");
272 }
273 debug.append("numPublishedVersions: " + numPublishedVersions + "\n");
274 if (doNormalSolrDocCleanUp) {
275 IndexResponse resultOfAttemptToPremptivelyDeletePublishedFiles = solrIndexService.deleteMultipleSolrIds(solrIdsOfFilesToDelete);
276 debug.append("result of attempt to premptively deleted published files before reindexing: " + resultOfAttemptToPremptivelyDeletePublishedFiles + "\n");
277 }
278 DatasetVersion latestVersion = dataset.getLatestVersion();
279 String latestVersionStateString = latestVersion.getVersionState().name();
280 DatasetVersion.VersionState latestVersionState = latestVersion.getVersionState();
281 DatasetVersion releasedVersion = dataset.getReleasedVersion();
282 boolean atLeastOnePublishedVersion = false;
283 if (releasedVersion != null) {
284 atLeastOnePublishedVersion = true;
285 } else {
286 atLeastOnePublishedVersion = false;
287 }
288 Map<DatasetVersion.VersionState, Boolean> desiredCards = new LinkedHashMap<>();
289 /**
290 * @todo refactor all of this below and have a single method that takes
291 * the map of desired cards (which correspond to Solr documents) as one
292 * of the arguments and does all the operations necessary to achieve the
293 * desired state.
294 */
295 StringBuilder results = new StringBuilder();
296 if (atLeastOnePublishedVersion == false) {
297 results.append("No published version, nothing will be indexed as ")
298 .append(solrIdPublished).append("\n");
299 if (latestVersionState.equals(DatasetVersion.VersionState.DRAFT)) {
300
301 desiredCards.put(DatasetVersion.VersionState.DRAFT, true);
302 IndexableDataset indexableDraftVersion = new IndexableDataset(latestVersion);
303 String indexDraftResult = addOrUpdateDataset(indexableDraftVersion);
304 results.append("The latest version is a working copy (latestVersionState: ")
305 .append(latestVersionStateString).append(") and indexing was attempted for ")
306 .append(solrIdDraftDataset).append(" (limited discoverability). Result: ")
307 .append(indexDraftResult).append("\n");
308
309 desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
310 if (doNormalSolrDocCleanUp) {
311 String deleteDeaccessionedResult = removeDeaccessioned(dataset);
312 results.append("Draft exists, no need for deaccessioned version. Deletion attempted for ")
313 .append(solrIdDeaccessioned).append(" (and files). Result: ")
314 .append(deleteDeaccessionedResult).append("\n");
315 }
316
317 desiredCards.put(DatasetVersion.VersionState.RELEASED, false);
318 if (doNormalSolrDocCleanUp) {
319 String deletePublishedResults = removePublished(dataset);
320 results.append("No published version. Attempting to delete traces of published version from index. Result: ").
321 append(deletePublishedResults).append("\n");
322 }
323
324 /**
325 * Desired state for existence of cards: {DRAFT=true,
326 * DEACCESSIONED=false, RELEASED=false}
327 *
328 * No published version, nothing will be indexed as dataset_17
329 *
330 * The latest version is a working copy (latestVersionState:
331 * DRAFT) and indexing was attempted for dataset_17_draft
332 * (limited discoverability). Result: indexed dataset 17 as
333 * dataset_17_draft. filesIndexed: [datafile_18_draft]
334 *
335 * Draft exists, no need for deaccessioned version. Deletion
336 * attempted for dataset_17_deaccessioned (and files). Result:
337 * Attempted to delete dataset_17_deaccessioned from Solr index.
338 * updateReponse was:
339 * {responseHeader={status=0,QTime=1}}Attempted to delete
340 * datafile_18_deaccessioned from Solr index. updateReponse was:
341 * {responseHeader={status=0,QTime=1}}
342 *
343 * No published version. Attempting to delete traces of
344 * published version from index. Result: Attempted to delete
345 * dataset_17 from Solr index. updateReponse was:
346 * {responseHeader={status=0,QTime=1}}Attempted to delete
347 * datafile_18 from Solr index. updateReponse was:
348 * {responseHeader={status=0,QTime=0}}
349 */
350 String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString();
351 logger.fine(result);
352 indexDatasetPermissions(dataset);
353 return new AsyncResult<>(result);
354 } else if (latestVersionState.equals(DatasetVersion.VersionState.DEACCESSIONED)) {
355
356 desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, true);
357 IndexableDataset indexableDeaccessionedVersion = new IndexableDataset(latestVersion);
358 String indexDeaccessionedVersionResult = addOrUpdateDataset(indexableDeaccessionedVersion);
359 results.append("No draft version. Attempting to index as deaccessioned. Result: ").append(indexDeaccessionedVersionResult).append("\n");
360
361 desiredCards.put(DatasetVersion.VersionState.RELEASED, false);
362 if (doNormalSolrDocCleanUp) {
363 String deletePublishedResults = removePublished(dataset);
364 results.append("No published version. Attempting to delete traces of published version from index. Result: ").
365 append(deletePublishedResults).append("\n");
366 }
367
368 desiredCards.put(DatasetVersion.VersionState.DRAFT, false);
369 if (doNormalSolrDocCleanUp) {
370 List<String> solrDocIdsForDraftFilesToDelete = findSolrDocIdsForDraftFilesToDelete(dataset);
371 String deleteDraftDatasetVersionResult = removeSolrDocFromIndex(solrIdDraftDataset);
372 String deleteDraftFilesResults = deleteDraftFiles(solrDocIdsForDraftFilesToDelete);
373 results.append("Attempting to delete traces of drafts. Result: ")
374 .append(deleteDraftDatasetVersionResult).append(deleteDraftFilesResults).append("\n");
375 }
376
377 /**
378 * Desired state for existence of cards: {DEACCESSIONED=true,
379 * RELEASED=false, DRAFT=false}
380 *
381 * No published version, nothing will be indexed as dataset_17
382 *
383 * No draft version. Attempting to index as deaccessioned.
384 * Result: indexed dataset 17 as dataset_17_deaccessioned.
385 * filesIndexed: []
386 *
387 * No published version. Attempting to delete traces of
388 * published version from index. Result: Attempted to delete
389 * dataset_17 from Solr index. updateReponse was:
390 * {responseHeader={status=0,QTime=0}}Attempted to delete
391 * datafile_18 from Solr index. updateReponse was:
392 * {responseHeader={status=0,QTime=3}}
393 *
394 * Attempting to delete traces of drafts. Result: Attempted to
395 * delete dataset_17_draft from Solr index. updateReponse was:
396 * {responseHeader={status=0,QTime=1}}
397 */
398 String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString();
399 logger.fine(result);
400 indexDatasetPermissions(dataset);
401 return new AsyncResult<>(result);
402 } else {
403 String result = "No-op. Unexpected condition reached: No released version and latest version is neither draft nor deaccessioned";
404 logger.fine(result);
405 return new AsyncResult<>(result);
406 }
407 } else if (atLeastOnePublishedVersion == true) {
408 results.append("Published versions found. ")
409 .append("Will attempt to index as ").append(solrIdPublished).append(" (discoverable by anonymous)\n");
410 if (latestVersionState.equals(DatasetVersion.VersionState.RELEASED)
411 || latestVersionState.equals(DatasetVersion.VersionState.DEACCESSIONED)) {
412
413 desiredCards.put(DatasetVersion.VersionState.RELEASED, true);
414 IndexableDataset indexableReleasedVersion = new IndexableDataset(releasedVersion);
415 String indexReleasedVersionResult = addOrUpdateDataset(indexableReleasedVersion);
416 results.append("Attempted to index " + solrIdPublished).append(". Result: ").append(indexReleasedVersionResult).append("\n");
417
418 desiredCards.put(DatasetVersion.VersionState.DRAFT, false);
419 if (doNormalSolrDocCleanUp) {
420 List<String> solrDocIdsForDraftFilesToDelete = findSolrDocIdsForDraftFilesToDelete(dataset);
421 String deleteDraftDatasetVersionResult = removeSolrDocFromIndex(solrIdDraftDataset);
422 String deleteDraftFilesResults = deleteDraftFiles(solrDocIdsForDraftFilesToDelete);
423 results.append("The latest version is published. Attempting to delete drafts. Result: ")
424 .append(deleteDraftDatasetVersionResult).append(deleteDraftFilesResults).append("\n");
425 }
426
427 desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
428 if (doNormalSolrDocCleanUp) {
429 String deleteDeaccessionedResult = removeDeaccessioned(dataset);
430 results.append("No need for deaccessioned version. Deletion attempted for ")
431 .append(solrIdDeaccessioned).append(". Result: ").append(deleteDeaccessionedResult);
432 }
433
434 /**
435 * Desired state for existence of cards: {RELEASED=true,
436 * DRAFT=false, DEACCESSIONED=false}
437 *
438 * Released versions found: 1. Will attempt to index as
439 * dataset_17 (discoverable by anonymous)
440 *
441 * Attempted to index dataset_17. Result: indexed dataset 17 as
442 * dataset_17. filesIndexed: [datafile_18]
443 *
444 * The latest version is published. Attempting to delete drafts.
445 * Result: Attempted to delete dataset_17_draft from Solr index.
446 * updateReponse was: {responseHeader={status=0,QTime=1}}
447 *
448 * No need for deaccessioned version. Deletion attempted for
449 * dataset_17_deaccessioned. Result: Attempted to delete
450 * dataset_17_deaccessioned from Solr index. updateReponse was:
451 * {responseHeader={status=0,QTime=1}}Attempted to delete
452 * datafile_18_deaccessioned from Solr index. updateReponse was:
453 * {responseHeader={status=0,QTime=0}}
454 */
455 String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString();
456 logger.fine(result);
457 indexDatasetPermissions(dataset);
458 return new AsyncResult<>(result);
459 } else if (latestVersionState.equals(DatasetVersion.VersionState.DRAFT)) {
460
461 IndexableDataset indexableDraftVersion = new IndexableDataset(latestVersion);
462 desiredCards.put(DatasetVersion.VersionState.DRAFT, true);
463 String indexDraftResult = addOrUpdateDataset(indexableDraftVersion);
464 results.append("The latest version is a working copy (latestVersionState: ")
465 .append(latestVersionStateString).append(") and will be indexed as ")
466 .append(solrIdDraftDataset).append(" (limited visibility). Result: ").append(indexDraftResult).append("\n");
467
468 desiredCards.put(DatasetVersion.VersionState.RELEASED, true);
469 IndexableDataset indexableReleasedVersion = new IndexableDataset(releasedVersion);
470 String indexReleasedVersionResult = addOrUpdateDataset(indexableReleasedVersion);
471 results.append("There is a published version we will attempt to index. Result: ").append(indexReleasedVersionResult).append("\n");
472
473 desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
474 if (doNormalSolrDocCleanUp) {
475 String deleteDeaccessionedResult = removeDeaccessioned(dataset);
476 results.append("No need for deaccessioned version. Deletion attempted for ")
477 .append(solrIdDeaccessioned).append(". Result: ").append(deleteDeaccessionedResult);
478 }
479
480 /**
481 * Desired state for existence of cards: {DRAFT=true,
482 * RELEASED=true, DEACCESSIONED=false}
483 *
484 * Released versions found: 1. Will attempt to index as
485 * dataset_17 (discoverable by anonymous)
486 *
487 * The latest version is a working copy (latestVersionState:
488 * DRAFT) and will be indexed as dataset_17_draft (limited
489 * visibility). Result: indexed dataset 17 as dataset_17_draft.
490 * filesIndexed: [datafile_18_draft]
491 *
492 * There is a published version we will attempt to index.
493 * Result: indexed dataset 17 as dataset_17. filesIndexed:
494 * [datafile_18]
495 *
496 * No need for deaccessioned version. Deletion attempted for
497 * dataset_17_deaccessioned. Result: Attempted to delete
498 * dataset_17_deaccessioned from Solr index. updateReponse was:
499 * {responseHeader={status=0,QTime=1}}Attempted to delete
500 * datafile_18_deaccessioned from Solr index. updateReponse was:
501 * {responseHeader={status=0,QTime=0}}
502 */
503 String result = getDesiredCardState(desiredCards) + results.toString() + debug.toString();
504 logger.fine(result);
505 indexDatasetPermissions(dataset);
506 return new AsyncResult<>(result);
507 } else {
508 String result = "No-op. Unexpected condition reached: There is at least one published version but the latest version is neither published nor draft";
509 logger.fine(result);
510 return new AsyncResult<>(result);
511 }
512 } else {
513 String result = "No-op. Unexpected condition reached: Has a version been published or not?";
514 logger.fine(result);
515 return new AsyncResult<>(result);
516 }
517 }
518
519 private String deleteDraftFiles(List<String> solrDocIdsForDraftFilesToDelete) {
520 String deleteDraftFilesResults = "";
521 IndexResponse indexResponse = solrIndexService.deleteMultipleSolrIds(solrDocIdsForDraftFilesToDelete);
522 deleteDraftFilesResults = indexResponse.toString();
523 return deleteDraftFilesResults;
524 }
525
526 private IndexResponse indexDatasetPermissions(Dataset dataset) {
527 IndexResponse indexResponse = solrIndexService.indexPermissionsOnSelfAndChildren(dataset);
528 return indexResponse;
529 }
530
531 private String addOrUpdateDataset(IndexableDataset indexableDataset) {
532 IndexableDataset.DatasetState state = indexableDataset.getDatasetState();
533 Dataset dataset = indexableDataset.getDatasetVersion().getDataset();
534 logger.fine("adding or updating Solr document for dataset id " + dataset.getId());
535 Collection<SolrInputDocument> docs = new ArrayList<>();
536 List<String> dataversePathSegmentsAccumulator = new ArrayList<>();
537 List<String> dataverseSegments = new ArrayList<>();
538 try {
539 dataverseSegments = findPathSegments(dataset.getOwner(), dataversePathSegmentsAccumulator);
540 } catch (Exception ex) {
541 logger.info("failed to find dataverseSegments for dataversePaths for " + SearchFields.SUBTREE + ": " + ex);
542 }
543 List<String> dataversePaths = getDataversePathsFromSegments(dataverseSegments);
544 //Add Paths for linking dataverses
545 for (Dataverse linkingDataverse : dsLinkingService.findLinkingDataverses(dataset.getId())) {
546 List<String> linkingDataversePathSegmentsAccumulator = new ArrayList<>();
547 List<String> linkingdataverseSegments = findPathSegments(linkingDataverse, linkingDataversePathSegmentsAccumulator);
548 List<String> linkingDataversePaths = getDataversePathsFromSegments(linkingdataverseSegments);
549 for (String dvPath : linkingDataversePaths) {
550 dataversePaths.add(dvPath);
551 }
552 }
553 SolrInputDocument solrInputDocument = new SolrInputDocument();
554 String datasetSolrDocId = indexableDataset.getSolrDocId();
555 solrInputDocument.addField(SearchFields.ID, datasetSolrDocId);
556 solrInputDocument.addField(SearchFields.ENTITY_ID, dataset.getId());
557 solrInputDocument.addField(SearchFields.IDENTIFIER, dataset.getGlobalId());
558 solrInputDocument.addField(SearchFields.DATASET_PERSISTENT_ID, dataset.getGlobalId());
559 solrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL());
560 solrInputDocument.addField(SearchFields.TYPE, "datasets");
561
562 Date datasetSortByDate = new Date();
563 Date majorVersionReleaseDate = dataset.getMostRecentMajorVersionReleaseDate();
564 if (majorVersionReleaseDate != null) {
565 if (true) {
566 String msg = "major release date found: " + majorVersionReleaseDate.toString();
567 logger.fine(msg);
568 }
569 datasetSortByDate = majorVersionReleaseDate;
570 } else {
571 if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.WORKING_COPY)) {
572 solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING);
573 } else if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.DEACCESSIONED)) {
574 // uncomment this if we change our mind and want a deaccessioned facet after all
575 // solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DEACCESSIONED_STRING);
576 }
577 Date createDate = dataset.getCreateDate();
578 if (createDate != null) {
579 if (true) {
580 String msg = "can't find major release date, using create date: " + createDate;
581 logger.fine(msg);
582 }
583 datasetSortByDate = createDate;
584 } else {
585 String msg = "can't find major release date or create date, using \"now\"";
586 logger.info(msg);
587 datasetSortByDate = new Date();
588 }
589 }
590 solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, datasetSortByDate);
591 solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(datasetSortByDate));
592
593 if (state.equals(indexableDataset.getDatasetState().PUBLISHED)) {
594 solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING);
595 // solrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, dataset.getPublicationDate());
596 } else if (state.equals(indexableDataset.getDatasetState().WORKING_COPY)) {
597 solrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING);
598 }
599
600 addDatasetReleaseDateToSolrDoc(solrInputDocument, dataset);
601
602 DatasetVersion datasetVersion = indexableDataset.getDatasetVersion();
603 String parentDatasetTitle = "TBD";
604 if (datasetVersion != null) {
605
606 solrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId());
607 solrInputDocument.addField(SearchFields.DATASET_CITATION, datasetVersion.getCitation(true));
608
609 for (DatasetField dsf : datasetVersion.getFlatDatasetFields()) {
610
611 DatasetFieldType dsfType = dsf.getDatasetFieldType();
612 String solrFieldSearchable = dsfType.getSolrField().getNameSearchable();
613 String solrFieldFacetable = dsfType.getSolrField().getNameFacetable();
614
615 if (dsf.getValues() != null && !dsf.getValues().isEmpty() && dsf.getValues().get(0) != null && solrFieldSearchable != null) {
616 logger.fine("indexing " + dsf.getDatasetFieldType().getName() + ":" + dsf.getValues() + " into " + solrFieldSearchable + " and maybe " + solrFieldFacetable);
617 // if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.INTEGER)) {
618 if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.EMAIL)) {
619 //no-op. we want to keep email address out of Solr per https://github.com/IQSS/dataverse/issues/759
620 } else if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.DATE)) {
621 String dateAsString = dsf.getValues().get(0);
622 logger.fine("date as string: " + dateAsString);
623 if (dateAsString != null && !dateAsString.isEmpty()) {
624 SimpleDateFormat inputDateyyyy = new SimpleDateFormat("yyyy", Locale.ENGLISH);
625 try {
626 /**
627 * @todo when bean validation is working we
628 * won't have to convert strings into dates
629 */
630 logger.fine("Trying to convert " + dateAsString + " to a YYYY date from dataset " + dataset.getId());
631 Date dateAsDate = inputDateyyyy.parse(dateAsString);
632 SimpleDateFormat yearOnly = new SimpleDateFormat("yyyy");
633 String datasetFieldFlaggedAsDate = yearOnly.format(dateAsDate);
634 logger.fine("YYYY only: " + datasetFieldFlaggedAsDate);
635 // solrInputDocument.addField(solrFieldSearchable, Integer.parseInt(datasetFieldFlaggedAsDate));
636 solrInputDocument.addField(solrFieldSearchable, datasetFieldFlaggedAsDate);
637 if (dsfType.getSolrField().isFacetable()) {
638 // solrInputDocument.addField(solrFieldFacetable, Integer.parseInt(datasetFieldFlaggedAsDate));
639 solrInputDocument.addField(solrFieldFacetable, datasetFieldFlaggedAsDate);
640 }
641 } catch (Exception ex) {
642 logger.info("unable to convert " + dateAsString + " into YYYY format and couldn't index it (" + dsfType.getName() + ")");
643 }
644 }
645 } else {
646 // _s (dynamic string) and all other Solr fields
647
648 if (dsf.getDatasetFieldType().getName().equals("authorAffiliation")) {
649 /**
650 * @todo think about how to tie the fact that this
651 * needs to be multivalued (_ss) because a
652 * multivalued facet (authorAffilition_ss) is being
653 * collapsed into here at index time. The business
654 * logic to determine if a data-driven metadata
655 * field should be indexed into Solr as a single or
656 * multiple value lives in the getSolrField() method
657 * of DatasetField.java
658 */
659 solrInputDocument.addField(SearchFields.AFFILIATION, dsf.getValuesWithoutNaValues());
660 } else if (dsf.getDatasetFieldType().getName().equals("title")) {
661 // datasets have titles not names but index title under name as well so we can sort datasets by name along dataverses and files
662 List<String> possibleTitles = dsf.getValues();
663 String firstTitle = possibleTitles.get(0);
664 if (firstTitle != null) {
665 parentDatasetTitle = firstTitle;
666 }
667 solrInputDocument.addField(SearchFields.NAME_SORT, dsf.getValues());
668 }
669 if (dsfType.isControlledVocabulary()) {
670 for (ControlledVocabularyValue controlledVocabularyValue : dsf.getControlledVocabularyValues()) {
671 if (controlledVocabularyValue.getStrValue().equals(DatasetField.NA_VALUE)) {
672 continue;
673 }
674 solrInputDocument.addField(solrFieldSearchable, controlledVocabularyValue.getStrValue());
675 if (dsfType.getSolrField().isFacetable()) {
676 solrInputDocument.addField(solrFieldFacetable, controlledVocabularyValue.getStrValue());
677 }
678 }
679 } else {
680 if (dsfType.getFieldType().equals(DatasetFieldType.FieldType.TEXTBOX)) {
681 // strip HTML
682 List<String> htmlFreeText = StringUtil.htmlArray2textArray(dsf.getValuesWithoutNaValues());
683 solrInputDocument.addField(solrFieldSearchable, htmlFreeText);
684 if (dsfType.getSolrField().isFacetable()) {
685 solrInputDocument.addField(solrFieldFacetable, htmlFreeText);
686 }
687 } else {
688 // do not strip HTML
689 solrInputDocument.addField(solrFieldSearchable, dsf.getValuesWithoutNaValues());
690 if (dsfType.getSolrField().isFacetable()) {
691 solrInputDocument.addField(solrFieldFacetable, dsf.getValuesWithoutNaValues());
692 }
693 }
694 }
695 }
696 }
697 }
698 }
699
700 solrInputDocument.addField(SearchFields.SUBTREE, dataversePaths);
701 // solrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataset.getOwner().getName());
702 solrInputDocument.addField(SearchFields.PARENT_ID, dataset.getOwner().getId());
703 solrInputDocument.addField(SearchFields.PARENT_NAME, dataset.getOwner().getName());
704
705 if (state.equals(indexableDataset.getDatasetState().DEACCESSIONED)) {
706 String deaccessionNote = datasetVersion.getVersionNote();
707 if (deaccessionNote != null) {
708 solrInputDocument.addField(SearchFields.DATASET_DEACCESSION_REASON, deaccessionNote);
709 }
710 }
711
712 docs.add(solrInputDocument);
713
714 List<String> filesIndexed = new ArrayList<>();
715 if (datasetVersion != null) {
716 List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
717 boolean checkForDuplicateMetadata = false;
718 if (datasetVersion.isDraft() && dataset.isReleased() && dataset.getReleasedVersion() != null) {
719 checkForDuplicateMetadata = true;
720 logger.fine("We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions.");
721 }
722
723 for (FileMetadata fileMetadata : fileMetadatas) {
724 boolean indexThisMetadata = true;
725 if (checkForDuplicateMetadata) {
726 logger.fine("Checking if this file metadata is a duplicate.");
727 for (FileMetadata releasedFileMetadata : dataset.getReleasedVersion().getFileMetadatas()) {
728 if (fileMetadata.getDataFile() != null && fileMetadata.getDataFile().equals(releasedFileMetadata.getDataFile())) {
729 if (fileMetadata.contentEquals(releasedFileMetadata)) {
730 indexThisMetadata = false;
731 logger.fine("This file metadata hasn't changed since the released version; skipping indexing.");
732 } else {
733 logger.fine("This file metadata has changed since the released version; we want to index it!");
734 }
735 break;
736 }
737 }
738 }
739 if (indexThisMetadata) {
740 SolrInputDocument datafileSolrInputDocument = new SolrInputDocument();
741 Long fileEntityId = fileMetadata.getDataFile().getId();
742 datafileSolrInputDocument.addField(SearchFields.ENTITY_ID, fileEntityId);
743 datafileSolrInputDocument.addField(SearchFields.IDENTIFIER, fileEntityId);
744 datafileSolrInputDocument.addField(SearchFields.PERSISTENT_URL, dataset.getPersistentURL());
745 datafileSolrInputDocument.addField(SearchFields.TYPE, "files");
746
747 String filenameCompleteFinal = "";
748 if (fileMetadata != null) {
749 String filenameComplete = fileMetadata.getLabel();
750 if (filenameComplete != null) {
751 String filenameWithoutExtension = "";
752 // String extension = "";
753 int i = filenameComplete.lastIndexOf('.');
754 if (i > 0) {
755 // extension = filenameComplete.substring(i + 1);
756 try {
757 filenameWithoutExtension = filenameComplete.substring(0, i);
758 datafileSolrInputDocument.addField(SearchFields.FILENAME_WITHOUT_EXTENSION, filenameWithoutExtension);
759 datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameWithoutExtension);
760 } catch (IndexOutOfBoundsException ex) {
761 filenameWithoutExtension = "";
762 }
763 } else {
764 logger.info("problem with filename '" + filenameComplete + "': no extension? empty string as filename?");
765 filenameWithoutExtension = filenameComplete;
766 }
767 filenameCompleteFinal = filenameComplete;
768 }
769 }
770 datafileSolrInputDocument.addField(SearchFields.NAME, filenameCompleteFinal);
771 datafileSolrInputDocument.addField(SearchFields.NAME_SORT, filenameCompleteFinal);
772 datafileSolrInputDocument.addField(SearchFields.FILE_NAME, filenameCompleteFinal);
773
774 datafileSolrInputDocument.addField(SearchFields.DATASET_VERSION_ID, datasetVersion.getId());
775
776 /**
777 * for rules on sorting files see
778 * https://docs.google.com/a/harvard.edu/document/d/1DWsEqT8KfheKZmMB3n_VhJpl9nIxiUjai_AIQPAjiyA/edit?usp=sharing
779 * via https://redmine.hmdc.harvard.edu/issues/3701
780 */
781 Date fileSortByDate = new Date();
782 DataFile datafile = fileMetadata.getDataFile();
783 if (datafile != null) {
784 boolean fileHasBeenReleased = datafile.isReleased();
785 if (fileHasBeenReleased) {
786 logger.fine("indexing file with filePublicationTimestamp. " + fileMetadata.getId() + " (file id " + datafile.getId() + ")");
787 Timestamp filePublicationTimestamp = datafile.getPublicationDate();
788 if (filePublicationTimestamp != null) {
789 fileSortByDate = filePublicationTimestamp;
790 } else {
791 String msg = "filePublicationTimestamp was null for fileMetadata id " + fileMetadata.getId() + " (file id " + datafile.getId() + ")";
792 logger.info(msg);
793 }
794 } else {
795 logger.fine("indexing file with fileCreateTimestamp. " + fileMetadata.getId() + " (file id " + datafile.getId() + ")");
796 Timestamp fileCreateTimestamp = datafile.getCreateDate();
797 if (fileCreateTimestamp != null) {
798 fileSortByDate = fileCreateTimestamp;
799 } else {
800 String msg = "fileCreateTimestamp was null for fileMetadata id " + fileMetadata.getId() + " (file id " + datafile.getId() + ")";
801 logger.info(msg);
802 }
803 }
804 }
805 if (fileSortByDate == null) {
806 if (datasetSortByDate != null) {
807 logger.info("fileSortByDate was null, assigning datasetSortByDate");
808 fileSortByDate = datasetSortByDate;
809 } else {
810 logger.info("fileSortByDate and datasetSortByDate were null, assigning 'now'");
811 fileSortByDate = new Date();
812 }
813 }
814 datafileSolrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE, fileSortByDate);
815 datafileSolrInputDocument.addField(SearchFields.RELEASE_OR_CREATE_DATE_SEARCHABLE_TEXT, convertToFriendlyDate(fileSortByDate));
816
817 if (majorVersionReleaseDate == null) {
818 datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, UNPUBLISHED_STRING);
819 }
820
821 String fileSolrDocId = solrDocIdentifierFile + fileEntityId;
822 if (indexableDataset.getDatasetState().equals(indexableDataset.getDatasetState().PUBLISHED)) {
823 fileSolrDocId = solrDocIdentifierFile + fileEntityId;
824 datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, PUBLISHED_STRING);
825 // datafileSolrInputDocument.addField(SearchFields.PERMS, publicGroupString);
826 addDatasetReleaseDateToSolrDoc(datafileSolrInputDocument, dataset);
827 } else if (indexableDataset.getDatasetState().equals(indexableDataset.getDatasetState().WORKING_COPY)) {
828 fileSolrDocId = solrDocIdentifierFile + fileEntityId + indexableDataset.getDatasetState().getSuffix();
829 datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, DRAFT_STRING);
830 }
831 datafileSolrInputDocument.addField(SearchFields.ID, fileSolrDocId);
832
833 datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_FRIENDLY, fileMetadata.getDataFile().getFriendlyType());
834 datafileSolrInputDocument.addField(SearchFields.FILE_CONTENT_TYPE, fileMetadata.getDataFile().getContentType());
835 datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, fileMetadata.getDataFile().getFriendlyType());
836 // For the file type facets, we have a property file that maps mime types
837 // to facet-friendly names; "application/fits" should become "FITS", etc.:
838 datafileSolrInputDocument.addField(SearchFields.FILE_TYPE, FileUtil.getFacetFileType(fileMetadata.getDataFile()));
839 datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, FileUtil.getFacetFileType(fileMetadata.getDataFile()));
840 datafileSolrInputDocument.addField(SearchFields.FILE_SIZE_IN_BYTES, fileMetadata.getDataFile().getFilesize());
841 datafileSolrInputDocument.addField(SearchFields.FILE_MD5, fileMetadata.getDataFile().getmd5());
842 datafileSolrInputDocument.addField(SearchFields.DESCRIPTION, fileMetadata.getDescription());
843 datafileSolrInputDocument.addField(SearchFields.FILE_DESCRIPTION, fileMetadata.getDescription());
844 datafileSolrInputDocument.addField(SearchFields.UNF, fileMetadata.getDataFile().getUnf());
845 datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths);
846 // datafileSolrInputDocument.addField(SearchFields.HOST_DATAVERSE, dataFile.getOwner().getOwner().getName());
847 // datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, dataFile.getDataset().getTitle());
848 datafileSolrInputDocument.addField(SearchFields.PARENT_ID, fileMetadata.getDataFile().getOwner().getId());
849 datafileSolrInputDocument.addField(SearchFields.PARENT_IDENTIFIER, fileMetadata.getDataFile().getOwner().getGlobalId());
850 datafileSolrInputDocument.addField(SearchFields.PARENT_CITATION, fileMetadata.getDataFile().getOwner().getCitation());
851
852 datafileSolrInputDocument.addField(SearchFields.PARENT_NAME, parentDatasetTitle);
853
854 // If this is a tabular data file -- i.e., if there are data
855 // variables associated with this file, we index the variable
856 // names and labels:
857 if (fileMetadata.getDataFile().isTabularData()) {
858 List<DataVariable> variables = fileMetadata.getDataFile().getDataTable().getDataVariables();
859 for (DataVariable var : variables) {
860 // Hard-coded search fields, for now:
861 // TODO: eventually: review, decide how datavariables should
862 // be handled for indexing purposes. (should it be a fixed
863 // setup, defined in the code? should it be flexible? unlikely
864 // that this needs to be domain-specific... since these data
865 // variables are quite specific to tabular data, which in turn
866 // is something social science-specific...
867 // anyway -- needs to be reviewed. -- L.A. 4.0alpha1
868
869 if (var.getName() != null && !var.getName().equals("")) {
870 datafileSolrInputDocument.addField(SearchFields.VARIABLE_NAME, var.getName());
871 }
872 if (var.getLabel() != null && !var.getLabel().equals("")) {
873 datafileSolrInputDocument.addField(SearchFields.VARIABLE_LABEL, var.getLabel());
874 }
875 }
876 }
877
878 if (indexableDataset.isFilesShouldBeIndexed()) {
879 filesIndexed.add(fileSolrDocId);
880 docs.add(datafileSolrInputDocument);
881 }
882 }
883 }
884 }
885
886 SolrServer server = new HttpSolrServer("http://" + systemConfig.getSolrHostColonPort() + "/solr");
887
888 try {
889 server.add(docs);
890 } catch (SolrServerException | IOException ex) {
891 return ex.toString();
892 }
893 try {
894 server.commit();
895 } catch (SolrServerException | IOException ex) {
896 return ex.toString();
897 }
898
899 dvObjectService.updateContentIndexTime(dataset);
900
901 // return "indexed dataset " + dataset.getId() + " as " + solrDocId + "\nindexFilesResults for " + solrDocId + ":" + fileInfo.toString();
902 return "indexed dataset " + dataset.getId() + " as " + datasetSolrDocId + ". filesIndexed: " + filesIndexed;
903 }
904
905 public List<String> findPathSegments(Dataverse dataverse, List<String> segments) {
906 Dataverse rootDataverse = findRootDataverseCached();
907 if (!dataverse.equals(rootDataverse)) {
908 // important when creating root dataverse
909 if (dataverse.getOwner() != null) {
910 findPathSegments(dataverse.getOwner(), segments);
911 }
912 segments.add(dataverse.getId().toString());
913 return segments;
914 } else {
915 // base case
916 return segments;
917 }
918 }
919
920 List<String> getDataversePathsFromSegments(List<String> dataversePathSegments) {
921 List<String> subtrees = new ArrayList<>();
922 for (int i = 0; i < dataversePathSegments.size(); i++) {
923 StringBuilder pathBuilder = new StringBuilder();
924 int numSegments = dataversePathSegments.size();
925 for (int j = 0; j < numSegments; j++) {
926 if (j <= i) {
927 pathBuilder.append("/" + dataversePathSegments.get(j));
928 }
929 }
930 subtrees.add(pathBuilder.toString());
931 }
932 return subtrees;
933 }
934
935 private void addDataverseReleaseDateToSolrDoc(SolrInputDocument solrInputDocument, Dataverse dataverse) {
936 if (dataverse.getPublicationDate() != null) {
937 Calendar calendar = Calendar.getInstance();
938 calendar.setTimeInMillis(dataverse.getPublicationDate().getTime());
939 int YYYY = calendar.get(Calendar.YEAR);
940 solrInputDocument.addField(SearchFields.PUBLICATION_DATE, YYYY);
941 }
942 }
943
944 private void addDatasetReleaseDateToSolrDoc(SolrInputDocument solrInputDocument, Dataset dataset) {
945 if (dataset.getPublicationDate() != null) {
946 Calendar calendar = Calendar.getInstance();
947 calendar.setTimeInMillis(dataset.getPublicationDate().getTime());
948 int YYYY = calendar.get(Calendar.YEAR);
949 solrInputDocument.addField(SearchFields.PUBLICATION_DATE, YYYY);
950 solrInputDocument.addField(SearchFields.DATASET_PUBLICATION_DATE, YYYY);
951 }
952 }
953
    /** Exposes the static {@code groupPrefix} value (declared earlier in this class). */
    public static String getGroupPrefix() {
        return groupPrefix;
    }
957
    /** Exposes the static {@code groupPerUserPrefix} value (declared earlier in this class). */
    public static String getGroupPerUserPrefix() {
        return groupPerUserPrefix;
    }
961
    /** Exposes the static {@code publicGroupString} value (declared earlier in this class). */
    public static String getPublicGroupString() {
        return publicGroupString;
    }
965
    /** Exposes the "published" publication-status label used in Solr documents. */
    public static String getPUBLISHED_STRING() {
        return PUBLISHED_STRING;
    }
969
    /** Exposes the "unpublished" publication-status label used in Solr documents. */
    public static String getUNPUBLISHED_STRING() {
        return UNPUBLISHED_STRING;
    }
973
    /** Exposes the "draft" publication-status label used in Solr documents. */
    public static String getDRAFT_STRING() {
        return DRAFT_STRING;
    }
977
    /** Exposes the "deaccessioned" publication-status label used in Solr documents. */
    public static String getDEACCESSIONED_STRING() {
        return DEACCESSIONED_STRING;
    }
981
982 public String delete(Dataverse doomed) {
983 SolrServer server = new HttpSolrServer("http://" + systemConfig.getSolrHostColonPort() + "/solr");
984
985 logger.fine("deleting Solr document for dataverse " + doomed.getId());
986 UpdateResponse updateResponse;
987 try {
988 updateResponse = server.deleteById(solrDocIdentifierDataverse + doomed.getId());
989 } catch (SolrServerException | IOException ex) {
990 return ex.toString();
991 }
992 try {
993 server.commit();
994 } catch (SolrServerException | IOException ex) {
995 return ex.toString();
996 }
997 String response = "Successfully deleted dataverse " + doomed.getId() + " from Solr index. updateReponse was: " + updateResponse.toString();
998 logger.fine(response);
999 return response;
1000 }
1001
1002 /**
1003 * @todo call this in fewer places, favoring
1004 * SolrIndexServiceBeans.deleteMultipleSolrIds instead to operate in batches
1005 *
1006 * https://github.com/IQSS/dataverse/issues/142
1007 */
1008 public String removeSolrDocFromIndex(String doomed) {
1009 SolrServer server = new HttpSolrServer("http://" + systemConfig.getSolrHostColonPort() + "/solr");
1010
1011 logger.fine("deleting Solr document: " + doomed);
1012 UpdateResponse updateResponse;
1013 try {
1014 updateResponse = server.deleteById(doomed);
1015 } catch (SolrServerException | IOException ex) {
1016 return ex.toString();
1017 }
1018 try {
1019 server.commit();
1020 } catch (SolrServerException | IOException ex) {
1021 return ex.toString();
1022 }
1023 String response = "Attempted to delete " + doomed + " from Solr index. updateReponse was: " + updateResponse.toString();
1024 logger.fine(response);
1025 return response;
1026 }
1027
1028 public String convertToFriendlyDate(Date dateAsDate) {
1029 if (dateAsDate == null) {
1030 dateAsDate = new Date();
1031 }
1032 // using DateFormat.MEDIUM for May 5, 2014 to match what's in DVN 3.x
1033 DateFormat format = DateFormat.getDateInstance(DateFormat.MEDIUM);
1034 String friendlyDate = format.format(dateAsDate);
1035 return friendlyDate;
1036 }
1037
1038 private List<String> findSolrDocIdsForDraftFilesToDelete(Dataset datasetWithDraftFilesToDelete) {
1039 List<String> solrIdsOfFilesToDelete = new ArrayList<>();
1040 for (DatasetVersion datasetVersion : datasetWithDraftFilesToDelete.getVersions()) {
1041 for (FileMetadata fileMetadata : datasetVersion.getFileMetadatas()) {
1042 DataFile datafile = fileMetadata.getDataFile();
1043 if (datafile != null) {
1044 solrIdsOfFilesToDelete.add(solrDocIdentifierFile + datafile.getId() + draftSuffix);
1045 }
1046 }
1047
1048 }
1049 return solrIdsOfFilesToDelete;
1050 }
1051
1052 private List<String> findSolrDocIdsForFilesToDelete(Dataset dataset, IndexableDataset.DatasetState state) {
1053 List<String> solrIdsOfFilesToDelete = new ArrayList<>();
1054 for (DataFile file : dataset.getFiles()) {
1055 solrIdsOfFilesToDelete.add(solrDocIdentifierFile + file.getId() + state.getSuffix());
1056 }
1057 return solrIdsOfFilesToDelete;
1058 }
1059
    // Deletes the given Solr doc ids in one batch via SolrIndexServiceBean
    // and returns the resulting IndexResponse rendered as a string.
    private String removeMultipleSolrDocs(List<String> docIds) {
        IndexResponse indexResponse = solrIndexService.deleteMultipleSolrIds(docIds);
        return indexResponse.toString();
    }
1064
    // Builds the Solr doc id for the PUBLISHED version of a dataset:
    // type name + "_" + database id + the PUBLISHED state suffix.
    private String determinePublishedDatasetSolrDocId(Dataset dataset) {
        return IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.PUBLISHED.getSuffix();
    }
1068
    // Builds the Solr doc id for the DEACCESSIONED version of a dataset:
    // type name + "_" + database id + the DEACCESSIONED state suffix.
    private String determineDeaccessionedDatasetId(Dataset dataset) {
        return IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.DEACCESSIONED.getSuffix();
    }
1072
1073 private String removeDeaccessioned(Dataset dataset) {
1074 StringBuilder result = new StringBuilder();
1075 String deleteDeaccessionedResult = removeSolrDocFromIndex(determineDeaccessionedDatasetId(dataset));
1076 result.append(deleteDeaccessionedResult);
1077 List<String> docIds = findSolrDocIdsForFilesToDelete(dataset, IndexableDataset.DatasetState.DEACCESSIONED);
1078 String deleteFilesResult = removeMultipleSolrDocs(docIds);
1079 result.append(deleteFilesResult);
1080 return result.toString();
1081 }
1082
1083 private String removePublished(Dataset dataset) {
1084 StringBuilder result = new StringBuilder();
1085 String deletePublishedResult = removeSolrDocFromIndex(determinePublishedDatasetSolrDocId(dataset));
1086 result.append(deletePublishedResult);
1087 List<String> docIds = findSolrDocIdsForFilesToDelete(dataset, IndexableDataset.DatasetState.PUBLISHED);
1088 String deleteFilesResult = removeMultipleSolrDocs(docIds);
1089 result.append(deleteFilesResult);
1090 return result.toString();
1091 }
1092
1093 private Dataverse findRootDataverseCached() {
1094 if (true) {
1095 /**
1096 * @todo Is the code below working at all? We don't want the root
1097 * dataverse to be indexed into Solr. Specifically, we don't want a
1098 * dataverse "card" to show up while browsing.
1099 *
1100 * Let's just find the root dataverse and be done with it. We'll
1101 * figure out the caching later.
1102 */
1103 try {
1104 Dataverse rootDataverse = dataverseService.findRootDataverse();
1105 return rootDataverse;
1106 } catch (EJBException ex) {
1107 logger.info("caught " + ex);
1108 Throwable cause = ex.getCause();
1109 while (cause.getCause() != null) {
1110 logger.info("caused by... " + cause);
1111 cause = cause.getCause();
1112 }
1113 return null;
1114 }
1115 }
1116
1117 /**
1118 * @todo Why isn't this code working?
1119 */
1120 if (rootDataverseCached != null) {
1121 return rootDataverseCached;
1122 } else {
1123 rootDataverseCached = dataverseService.findRootDataverse();
1124 if (rootDataverseCached != null) {
1125 return rootDataverseCached;
1126 } else {
1127 throw new RuntimeException("unable to determine root dataverse");
1128 }
1129 }
1130 }
1131
1132 private String getDesiredCardState(Map<DatasetVersion.VersionState, Boolean> desiredCards) {
1133 /**
1134 * @todo make a JVM option to enforce sanity checks? Call it dev=true?
1135 */
1136 boolean sanityCheck = true;
1137 if (sanityCheck) {
1138 Set<DatasetVersion.VersionState> expected = new HashSet<>();
1139 expected.add(DatasetVersion.VersionState.DRAFT);
1140 expected.add(DatasetVersion.VersionState.RELEASED);
1141 expected.add(DatasetVersion.VersionState.DEACCESSIONED);
1142 if (!desiredCards.keySet().equals(expected)) {
1143 throw new RuntimeException("Mismatch between expected version states (" + expected + ") and version states passed in (" + desiredCards.keySet() + ")");
1144 }
1145 }
1146 return "Desired state for existence of cards: " + desiredCards + "\n";
1147 }
1148
1149 /**
1150 * @return Dataverses that should be reindexed either because they have
1151 * never been indexed or their index time is before their modification time.
1152 */
1153 public List findStaleOrMissingDataverses() {
1154 List<Dataverse> staleDataverses = new ArrayList<>();
1155 for (Dataverse dataverse : dataverseService.findAll()) {
1156 if (dataverse.equals(dataverseService.findRootDataverse())) {
1157 continue;
1158 }
1159 if (stale(dataverse)) {
1160 staleDataverses.add(dataverse);
1161 }
1162 }
1163 return staleDataverses;
1164 }
1165
1166 /**
1167 * @return Datasets that should be reindexed either because they have never
1168 * been indexed or their index time is before their modification time.
1169 */
1170 public List<Dataset> findStaleOrMissingDatasets() {
1171 List<Dataset> staleDatasets = new ArrayList<>();
1172 for (Dataset dataset : datasetService.findAll()) {
1173 if (stale(dataset)) {
1174 staleDatasets.add(dataset);
1175 }
1176 }
1177 return staleDatasets;
1178 }
1179
1180 private boolean stale(DvObject dvObject) {
1181 Timestamp indexTime = dvObject.getIndexTime();
1182 Timestamp modificationTime = dvObject.getModificationTime();
1183 if (indexTime == null) {
1184 return true;
1185 } else {
1186 if (indexTime.before(modificationTime)) {
1187 return true;
1188 }
1189 }
1190 return false;
1191 }
1192
1193 public List<Long> findDataversesInSolrOnly() throws SearchException {
1194 try {
1195 /**
1196 * @todo define this centrally and statically
1197 */
1198 return findDvObjectInSolrOnly("dataverses");
1199 } catch (SearchException ex) {
1200 throw ex;
1201 }
1202 }
1203
1204 public List<Long> findDatasetsInSolrOnly() throws SearchException {
1205 try {
1206 /**
1207 * @todo define this centrally and statically
1208 */
1209 return findDvObjectInSolrOnly("datasets");
1210 } catch (SearchException ex) {
1211 throw ex;
1212 }
1213 }
1214
1215 private List<Long> findDvObjectInSolrOnly(String type) throws SearchException {
1216 SolrServer solrServer = new HttpSolrServer("http://" + systemConfig.getSolrHostColonPort() + "/solr");
1217 SolrQuery solrQuery = new SolrQuery();
1218 solrQuery.setQuery("*");
1219 solrQuery.setRows(Integer.SIZE);
1220 solrQuery.addFilterQuery(SearchFields.TYPE + ":" + type);
1221 List<Long> dvObjectInSolrOnly = new ArrayList<>();
1222 QueryResponse queryResponse = null;
1223 try {
1224 queryResponse = solrServer.query(solrQuery);
1225 } catch (SolrServerException ex) {
1226 throw new SearchException("Error searching Solr for " + type, ex);
1227 }
1228 SolrDocumentList results = queryResponse.getResults();
1229 for (SolrDocument solrDocument : results) {
1230 Object idObject = solrDocument.getFieldValue(SearchFields.ENTITY_ID);
1231 if (idObject != null) {
1232 try {
1233 long id = (Long) idObject;
1234 DvObject dvobject = dvObjectService.findDvObject(id);
1235 if (dvobject == null) {
1236 dvObjectInSolrOnly.add(id);
1237 }
1238 } catch (ClassCastException ex) {
1239 throw new SearchException("Found " + SearchFields.ENTITY_ID + " but error casting " + idObject + " to long", ex);
1240 }
1241 }
1242 }
1243 return dvObjectInSolrOnly;
1244 }
1245
1246 }