branch develop updated (b25dc5e -> c4e9b3e)

codelutin.com scm

24 May 24 May

9:43 a.m.

New subject: 01/05: refs #9197 Utilisation de Apache Tika pour indexer les documents

This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit 8517e8f45186e4080ce56c7eb489d9592e2a4802 Author: Yannick Martel <martel@©odelutin.com> Date: Mon May 15 22:01:04 2017 +0200 refs #9197 Utilisation de Apache Tika pour indexer les documents --- coselmar-rest/pom.xml | 8 +- .../indexation/DocumentsIndexationService.java | 127 ++++++++++++++------- .../coselmar/services/indexation/LuceneUtils.java | 14 ++- .../coselmar/services/v1/AdminWebService.java | 2 +- .../coselmar/services/v1/DocumentsWebService.java | 38 ++++-- .../indexation/DocumentsIndexationServiceTest.java | 16 +-- pom.xml | 13 ++- 7 files changed, 146 insertions(+), 72 deletions(-) diff --git a/coselmar-rest/pom.xml b/coselmar-rest/pom.xml index bcf9b30..76b67c6 100644 --- a/coselmar-rest/pom.xml +++ b/coselmar-rest/pom.xml @@ -138,7 +138,7 @@ </dependency> <dependency> - <groupId>postgresql</groupId> + <groupId>org.postgresql</groupId> <artifactId>postgresql</artifactId> <scope>runtime</scope> </dependency> @@ -165,6 +165,12 @@ <artifactId>lucene-backward-codecs</artifactId> </dependency> +  + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + </dependency> +  <dependency> <groupId>com.github.spullara.mustache.java</groupId> diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index 16bca16..92402fb 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -24,6 +24,7 @@ package fr.ifremer.coselmar.services.indexation; * #L% */ +import java.io.File; import java.io.IOException; import java.util.ArrayList; import 
java.util.List; @@ -32,6 +33,8 @@ import java.util.Set; import fr.ifremer.coselmar.beans.DocumentBean; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; @@ -45,6 +48,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; +import org.apache.tika.exception.TikaException; /** * This Services provides operation about {@link fr.ifremer.coselmar.persistence.entity.Document} @@ -62,6 +66,8 @@ import org.apache.lucene.search.WildcardQuery; */ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { + private static final Log log = LogFactory.getLog(DocumentsIndexationService.class); + protected static final String DOCUMENT_ID_INDEX_PROPERTY = "documentId"; protected static final String DOCUMENT_NAME_INDEX_PROPERTY = "documentName"; protected static final String DOCUMENT_AUTHORS_INDEX_PROPERTY = "documentAuthors"; @@ -70,9 +76,10 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { protected static final String DOCUMENT_NAME_CLOUD_TAG_PROPERTY = "documentCloudTagName"; protected static final String DOCUMENT_SUMMARY_CLOUD_TAG_PROPERTY = "documentCloudTagSummary"; protected static final String DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY = "documentCloudTagKeyword"; + protected static final String DOCUMENT_FILE_CONTENT_INDEX_PROPERTY = "documentFileContent"; protected static final String DOCUMENT_TYPE = "documentindextype"; - public void indexDocument(DocumentBean document) throws IOException { + public void indexDocument(DocumentBean document, String filepath) throws IOException { Document doc = new Document(); doc.add(new 
StringField(DOCUMENT_ID_INDEX_PROPERTY, document.getId(), Field.Store.YES)); @@ -107,6 +114,20 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } } + if (StringUtils.isNotBlank(filepath)) { + try { + File documentFile = new File(filepath); + String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + } catch (TikaException te) { + if (log.isErrorEnabled()) { + String message = String.format("Unable to index document '%s'", filepath); + log.error(message); + } + } + + } + getLuceneUtils().getIndexWriter().addDocument(doc); getLuceneUtils().getIndexWriter().commit(); @@ -119,32 +140,35 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { String[] words = text.split(" "); // Parse a simple query that searches for the "text": - BooleanQuery query = new BooleanQuery(); + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); - BooleanQuery nameQuery = new BooleanQuery(); - BooleanQuery summaryQuery = new BooleanQuery(); - BooleanQuery authorsQuery = new BooleanQuery(); + BooleanQuery.Builder nameQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder summaryQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder authorsQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder documentFileQueryBuilder = new BooleanQuery.Builder(); for (String word : words) { String wildWord = String.format("*%s*", word.toLowerCase()); - nameQuery.add(new WildcardQuery(new Term(DOCUMENT_NAME_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); - summaryQuery.add(new WildcardQuery(new Term(DOCUMENT_SUMMARY_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); - authorsQuery.add(new WildcardQuery(new Term(DOCUMENT_AUTHORS_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + nameQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_NAME_INDEX_PROPERTY, 
wildWord)), BooleanClause.Occur.MUST); + summaryQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_SUMMARY_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + authorsQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_AUTHORS_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + documentFileQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); } - query.add(nameQuery, BooleanClause.Occur.SHOULD); - query.add(summaryQuery, BooleanClause.Occur.SHOULD); - query.add(authorsQuery, BooleanClause.Occur.SHOULD); + queryBuilder.add(nameQueryBuilder.build(), BooleanClause.Occur.SHOULD); + queryBuilder.add(summaryQueryBuilder.build(), BooleanClause.Occur.SHOULD); + queryBuilder.add(authorsQueryBuilder.build(), BooleanClause.Occur.SHOULD); + queryBuilder.add(documentFileQueryBuilder.build(), BooleanClause.Occur.SHOULD); - query.add(new TermQuery(new Term(DOCUMENT_KEYWORD_INDEX_PROPERTY, text.toLowerCase())), BooleanClause.Occur.SHOULD); + queryBuilder.add(new TermQuery(new Term(DOCUMENT_KEYWORD_INDEX_PROPERTY, text.toLowerCase())), BooleanClause.Occur.SHOULD); // Combine that with the type - BooleanQuery fullQuery = new BooleanQuery(); - fullQuery.add(query, BooleanClause.Occur.MUST); - fullQuery.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery.Builder fullQueryBuilder = new BooleanQuery.Builder(); + fullQueryBuilder.add(queryBuilder.build(), BooleanClause.Occur.MUST); + fullQueryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); - ScoreDoc[] hits = isearcher.search(fullQuery, null, 1000).scoreDocs; + ScoreDoc[] hits = isearcher.search(fullQueryBuilder.build(), 1000).scoreDocs; List<String> documentIds = new ArrayList(hits.length); @@ -163,42 +187,45 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { IndexSearcher isearcher = new IndexSearcher(ireader); - BooleanQuery keywordsQuery = new 
BooleanQuery(); + BooleanQuery.Builder keywordsQueryBuilder = new BooleanQuery.Builder(); for (String text : texts) { String[] words = text.split(" "); // Parse a simple query that searches for the "text": - BooleanQuery query = new BooleanQuery(); + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); - BooleanQuery nameQuery = new BooleanQuery(); - BooleanQuery summaryQuery = new BooleanQuery(); - BooleanQuery authorsQuery = new BooleanQuery(); + BooleanQuery.Builder nameQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder summaryQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder authorsQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder documentFileQueryBuilder = new BooleanQuery.Builder(); for (String word : words) { String wildWord = "*" + word.toLowerCase() + "*"; - nameQuery.add(new WildcardQuery(new Term(DOCUMENT_NAME_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); - summaryQuery.add(new WildcardQuery(new Term(DOCUMENT_SUMMARY_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); - authorsQuery.add(new WildcardQuery(new Term(DOCUMENT_AUTHORS_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + nameQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_NAME_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + summaryQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_SUMMARY_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + authorsQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_AUTHORS_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + documentFileQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); } - query.add(nameQuery, BooleanClause.Occur.SHOULD); - query.add(summaryQuery, BooleanClause.Occur.SHOULD); - query.add(authorsQuery, BooleanClause.Occur.SHOULD); + queryBuilder.add(nameQueryBuilder.build(), BooleanClause.Occur.SHOULD); + queryBuilder.add(summaryQueryBuilder.build(), 
BooleanClause.Occur.SHOULD); + queryBuilder.add(authorsQueryBuilder.build(), BooleanClause.Occur.SHOULD); + queryBuilder.add(documentFileQueryBuilder.build(), BooleanClause.Occur.SHOULD); - query.add(new TermQuery(new Term(DOCUMENT_KEYWORD_INDEX_PROPERTY, text.toLowerCase())), BooleanClause.Occur.SHOULD); + queryBuilder.add(new TermQuery(new Term(DOCUMENT_KEYWORD_INDEX_PROPERTY, text.toLowerCase())), BooleanClause.Occur.SHOULD); // Combine that with the type //XXX ymartel : put to Occur.SHOULD to make an "OR" - keywordsQuery.add(query, BooleanClause.Occur.MUST); + keywordsQueryBuilder.add(queryBuilder.build(), BooleanClause.Occur.MUST); } - BooleanQuery fullQuery = new BooleanQuery(); - fullQuery.add(keywordsQuery, BooleanClause.Occur.MUST); - fullQuery.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery.Builder fullQueryBuilder = new BooleanQuery.Builder(); + fullQueryBuilder.add(keywordsQueryBuilder.build(), BooleanClause.Occur.MUST); + fullQueryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); - ScoreDoc[] hits = isearcher.search(fullQuery, null, 1000).scoreDocs; + ScoreDoc[] hits = isearcher.search(fullQueryBuilder.build(), 1000).scoreDocs; List<String> documentIds = new ArrayList(hits.length); @@ -212,16 +239,16 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { return documentIds; } - public void updateDocument(DocumentBean document) throws IOException { + public void updateDocument(DocumentBean document, String filepath) throws IOException { DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); IndexSearcher isearcher = new IndexSearcher(ireader); // Retrieve document - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term(DOCUMENT_ID_INDEX_PROPERTY, document.getId())), BooleanClause.Occur.MUST); - query.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + 
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + queryBuilder.add(new TermQuery(new Term(DOCUMENT_ID_INDEX_PROPERTY, document.getId())), BooleanClause.Occur.MUST); + queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); - ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; + ScoreDoc[] hits = isearcher.search(queryBuilder.build(), 1000).scoreDocs; if (hits.length > 0) { Document doc = new Document(); doc.add(new StringField(DOCUMENT_ID_INDEX_PROPERTY, document.getId(), Field.Store.YES)); @@ -249,13 +276,27 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (keyword.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), TextField.TYPE_STORED)); + doc.add(new TextField(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), Field.Store.YES)); } } } doc.add(new Field("type", DOCUMENT_TYPE, TextField.TYPE_STORED)); + if (StringUtils.isNotBlank(filepath)) { + try { + File documentFile = new File(filepath); + String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + } catch (TikaException te) { + if (log.isErrorEnabled()) { + String message = String.format("Unable to index document '%s'", filepath); + log.error(message); + } + } + + } + getLuceneUtils().getIndexWriter().updateDocument(new Term(DOCUMENT_ID_INDEX_PROPERTY, document.getId()), doc); getLuceneUtils().getIndexWriter().commit(); } @@ -266,11 +307,11 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { public void deleteDocument(String documentId) throws IOException { // Retrieve document - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term(DOCUMENT_ID_INDEX_PROPERTY, documentId)), 
BooleanClause.Occur.MUST); - query.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + queryBuilder.add(new TermQuery(new Term(DOCUMENT_ID_INDEX_PROPERTY, documentId)), BooleanClause.Occur.MUST); + queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); - getLuceneUtils().getIndexWriter().deleteDocuments(query); + getLuceneUtils().getIndexWriter().deleteDocuments(queryBuilder.build()); getLuceneUtils().getIndexWriter().commit(); } diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java index b6c0736..43a3c43 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java @@ -26,22 +26,18 @@ package fr.ifremer.coselmar.services.indexation; import java.io.File; import java.io.IOException; -import java.io.InputStreamReader; import fr.ifremer.coselmar.config.CoselmarServicesConfig; -import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.core.SimpleAnalyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; -import org.apache.lucene.analysis.fr.FrenchAnalyzer; -import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.NIOFSDirectory; +import org.apache.tika.Tika; /** * @author ymartel <martel@codelutin.com> @@ -53,6 +49,7 @@ public class LuceneUtils { public Analyzer analyzer; public final IndexWriterConfig 
indexationConfig = new IndexWriterConfig(getAnalyzer()); public IndexWriter indexWriter; + protected Tika tika; protected CoselmarServicesConfig servicesConfig; @@ -81,6 +78,13 @@ public class LuceneUtils { return indexWriter; } + public Tika getTika() { + if (tika == null) { + this.tika = new Tika(); + } + return tika; + } + public void closeWriter() { if (indexWriter != null) { try { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java index 675b49a..33b3e57 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java @@ -78,7 +78,7 @@ public class AdminWebService extends CoselmarWebServiceSupport { for (Document document : documents) { String lightId = getPersistenceContext().getTopiaIdFactory().getRandomPart(document.getTopiaId()); DocumentBean documentBean = BeanEntityConverter.toBean(getPersistenceContext().getTopiaIdFactory(), document); - documentsIndexationService.indexDocument(documentBean); + documentsIndexationService.indexDocument(documentBean, document.getFilePath()); } // Get all questions diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java index 61cf842..9a1d010 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java @@ -445,7 +445,7 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); try { - documentsIndexationService.indexDocument(result); + documentsIndexationService.indexDocument(result, filePath); if 
(log.isDebugEnabled()) { String message = String.format("Document '%s' added to index", documentName); log.debug(message); @@ -471,11 +471,11 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { CoselmarUser currentUser = checkUserAuthentication(authorization); String documentFullId = getFullIdFromShort(Document.class, documentId); - Document document = getDocumentDao().forTopiaIdEquals(documentFullId).findAny(); + Document documentEntity = getDocumentDao().forTopiaIdEquals(documentFullId).findAny(); // Only Owner Expert or Supervisor/Admin can add document file if (!DOCUMENT_SUPER_USER_ROLES.contains(currentUser.getRole().name()) - && document.getOwner() != currentUser) { + && documentEntity.getOwner() != currentUser) { String message = String.format("User %s %s ('%s') is not allowed to add document file", currentUser.getFirstname(), currentUser.getName(), getShortIdFromFull(currentUser.getTopiaId())); if (log.isWarnEnabled()) { @@ -485,21 +485,37 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { } // Get owner to place correctly the file - CoselmarUser owner = document.getOwner(); + CoselmarUser owner = documentEntity.getOwner(); Pair<String, String> pathAndContentType = managerDocumentFile(uploadFile, owner); String filePath = pathAndContentType.getLeft(); String contentType = pathAndContentType.getRight(); // If document has already a file, remove it - if (StringUtils.isNotBlank(document.getFilePath())) { - File documentFile = new File(document.getFilePath()); + if (StringUtils.isNotBlank(documentEntity.getFilePath())) { + File documentFile = new File(documentEntity.getFilePath()); FileUtils.deleteQuietly(documentFile); } - document.setWithFile(true); - document.setMimeType(contentType); - document.setFilePath(filePath); - document.setFileName(uploadFile.getName()); + documentEntity.setWithFile(true); + documentEntity.setMimeType(contentType); + documentEntity.setFilePath(filePath); + 
documentEntity.setFileName(uploadFile.getName()); + + // Should update document index information to put the file + DocumentBean documentBean = BeanEntityConverter.toBean(getPersistenceContext().getTopiaIdFactory(), documentEntity); + + DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); + try { + documentsIndexationService.updateDocument(documentBean, filePath); // no document file for the moment here + if (log.isDebugEnabled()) { + String message = String.format("Document '%s' was updated in index", documentEntity.getName()); + log.debug(message); + } + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error("Unable to update document index information", e); + } + } commit(); @@ -654,7 +670,7 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); try { - documentsIndexationService.indexDocument(result); + documentsIndexationService.indexDocument(result, null); // no document file for the moment here if (log.isDebugEnabled()) { String message = String.format("Document '%s' was updated in index", document.getName()); log.debug(message); diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationServiceTest.java index 9f74687..acd53aa 100644 --- a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationServiceTest.java +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationServiceTest.java @@ -67,7 +67,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is not a fake document used for test", "fr", null, "Jack, Jane", null, null, false, null, "http://somewhere", "no comment", null, null); - 
documentsIndexationService.indexDocument(documentOne); + documentsIndexationService.indexDocument(documentOne, null); } @@ -124,7 +124,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is not a fake document used for test", "fr", null, "Jack, Jane", null, null, false, null, "http://somewhere", "no comment", null, null); - documentsIndexationService.indexDocument(documentOne); + documentsIndexationService.indexDocument(documentOne, null); List<String> documentMatchingDocumentIds = documentsIndexationService.searchDocuments("document"); Assert.assertEquals(1, documentMatchingDocumentIds.size()); @@ -149,7 +149,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is a faked doct updated for test", "fr", null, "James, JJ", null, null, false, null, "http://somewhere", "no comment", null, null); - documentsIndexationService.updateDocument(documentOne); + documentsIndexationService.updateDocument(documentOne, null); documentMatchingDocumentIds = documentsIndexationService.searchDocuments("document"); Assert.assertTrue(documentMatchingDocumentIds.isEmpty()); @@ -222,7 +222,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is not a fake document used for test", "fr", null, "Jack, Jane", null, null, false, null, "http://somewhere", "no comment", null, null); - documentsIndexationService.indexDocument(documentOne); + documentsIndexationService.indexDocument(documentOne, null); String documentTwoId = "testSearchMultiple_document2"; @@ -231,7 +231,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest new Date(), Lists.newArrayList("tardis", "documentation", "old", "new", "borrowed", "blue"), "testDocument", "This is part of document about the TARDIS", "fr", null, "The Doctor, Rose, Amy, River, Clara", null, null, false, null, "http://tardis.wikia.com/wiki/TARDIS", "no comment", null, null); - 
documentsIndexationService.indexDocument(documentTwo); + documentsIndexationService.indexDocument(documentTwo, null); List<String> documentMatchingDoctorIds = documentsIndexationService.searchDocuments(Arrays.asList("doctor")); @@ -266,7 +266,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is not a fake document used for test", "fr", null, "Jack, Jane", null, null, false, null, "http://somewhere", "no comment", null, null); - documentsIndexationService.indexDocument(documentOne); + documentsIndexationService.indexDocument(documentOne, null); DocumentBean documentTwo = new DocumentBean("document2", "Another document", "Amy Pond", "user002", Privacy.PUBLIC.name(), @@ -274,7 +274,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is just an other document used for test", "fr", null, "Amy, Rory", null, null, false, null, "http://somewhere", "no comment", null, null); - documentsIndexationService.indexDocument(documentTwo); + documentsIndexationService.indexDocument(documentTwo, null); DocumentBean documentThree = new DocumentBean("document3", "Tardis documentation", "The Doctor", "user003", Privacy.PUBLIC.name(), @@ -282,7 +282,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is part of documentation about the TARDIS", "fr", null, "The Doctor, Rose, Amy, River, Clara", null, null, false, null, "http://tardis.wikia.com/wiki/TARDIS", "no comment", null, null); - documentsIndexationService.indexDocument(documentThree); + documentsIndexationService.indexDocument(documentThree, null); } diff --git a/pom.xml b/pom.xml index 2c1ad62..f4bd15e 100644 --- a/pom.xml +++ b/pom.xml @@ -134,11 +134,12 @@ <nuitonValidatorVersion>3.0</nuitonValidatorVersion> <nuitonConvertorVersion>1.0</nuitonConvertorVersion> - <hibernateVersion>4.3.8.Final</hibernateVersion> - <postgresqlVersion>9.1-901-1.jdbc4</postgresqlVersion> + 
<hibernateVersion>4.3.11.Final</hibernateVersion> + <postgresqlVersion>9.4.1212.jre7</postgresqlVersion> <h2Version>1.4.190</h2Version> <luceneVersion>5.4.0</luceneVersion> + <tikaVersion>1.14</tikaVersion> <tomcatEmbedVersion>7.0.50</tomcatEmbedVersion> @@ -279,7 +280,7 @@ </dependency> <dependency> - <groupId>postgresql</groupId> + <groupId>org.postgresql</groupId> <artifactId>postgresql</artifactId> <version>${postgresqlVersion}</version> </dependency> @@ -328,6 +329,12 @@ <scope>runtime</scope> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + <version>${tikaVersion}</version> + </dependency> +  <dependency> <groupId>org.apache.commons</groupId> -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.

Reply

Sign in to reply online Use email software

codelutin.com scm

9:43 a.m.

New subject: 03/05: refs #9197 use vectors on document fields indexation

This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit c616ba1fb803d57b19b2c1dd0c5f1e0e62108b2b Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 16:08:11 2017 +0200 refs #9197 use vectors on document fields indexation --- .../indexation/DocumentsIndexationService.java | 20 ++++++++--------- .../coselmar/services/indexation/LuceneUtils.java | 15 +++++++++++++ .../indexation/QuestionsIndexationService.java | 25 ++++++---------------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index 87c4b68..e218577 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -87,16 +87,16 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { String documentName = document.getName(); String documentSummary = document.getSummary(); - doc.add(new TextField(DOCUMENT_NAME_INDEX_PROPERTY, documentName, Field.Store.YES)); + doc.add(new Field(DOCUMENT_NAME_INDEX_PROPERTY, documentName, LuceneUtils.TYPE_STORED)); if (StringUtils.isNotBlank(document.getAuthors())) { doc.add(new TextField(DOCUMENT_AUTHORS_INDEX_PROPERTY, document.getAuthors(), Field.Store.YES)); } - doc.add(new TextField(DOCUMENT_SUMMARY_INDEX_PROPERTY, documentSummary, Field.Store.YES)); + doc.add(new Field(DOCUMENT_SUMMARY_INDEX_PROPERTY, documentSummary, LuceneUtils.TYPE_STORED)); doc.add(new Field("type", DOCUMENT_TYPE, TextField.TYPE_STORED)); // Cloud Tag management if (documentName.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new 
TextField(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } // if (documentSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(DOCUMENT_SUMMARY_CLOUD_TAG_PROPERTY, documentSummary, Field.Store.YES)); @@ -109,7 +109,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (keyword.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), TextField.TYPE_STORED)); + doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } } @@ -118,7 +118,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { try { File documentFile = new File(filepath); String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, LuceneUtils.TYPE_STORED)); } catch (TikaException te) { if (log.isErrorEnabled()) { String message = String.format("Unable to index document '%s'", filepath); @@ -255,15 +255,15 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { String documentName = document.getName(); String documentSummary = document.getSummary(); - doc.add(new TextField(DOCUMENT_NAME_INDEX_PROPERTY, documentName, Field.Store.YES)); + doc.add(new Field(DOCUMENT_NAME_INDEX_PROPERTY, documentName, LuceneUtils.TYPE_STORED)); if (StringUtils.isNotBlank(document.getAuthors())) { doc.add(new TextField(DOCUMENT_AUTHORS_INDEX_PROPERTY, document.getAuthors(), Field.Store.YES)); } - doc.add(new TextField(DOCUMENT_SUMMARY_INDEX_PROPERTY, 
documentSummary, Field.Store.YES)); + doc.add(new Field(DOCUMENT_SUMMARY_INDEX_PROPERTY, documentSummary, LuceneUtils.TYPE_STORED)); // Cloud Tag management if (documentName.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new TextField(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } // if (documentSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(DOCUMENT_SUMMARY_CLOUD_TAG_PROPERTY, documentSummary, Field.Store.YES)); @@ -276,7 +276,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (keyword.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new TextField(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } } @@ -288,7 +288,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { File documentFile = new File(filepath); String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); if (StringUtils.isNotBlank(parsedDocumentFile)) { - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, LuceneUtils.TYPE_STORED)); } } catch (TikaException te) { if (log.isErrorEnabled()) { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java index 43a3c43..26224cf 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java +++ 
b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java @@ -33,6 +33,8 @@ import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; @@ -51,6 +53,19 @@ public class LuceneUtils { public IndexWriter indexWriter; protected Tika tika; + public static final FieldType TYPE_STORED = new FieldType(); + static { + TYPE_STORED.setOmitNorms(true); + TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + TYPE_STORED.setStored(true); + TYPE_STORED.setStoreTermVectors(true); + TYPE_STORED.setStoreTermVectorPositions(true); + TYPE_STORED.setStoreTermVectorOffsets(true); + TYPE_STORED.setStoreTermVectorPayloads(true); + TYPE_STORED.setTokenized(true); + TYPE_STORED.freeze(); + } + protected CoselmarServicesConfig servicesConfig; public LuceneUtils(CoselmarServicesConfig servicesConfig) { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index b5e92d3..f60ce70 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -86,19 +86,6 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { protected static final String QUESTION_THEME_CLOUD_TAG_PROPERTY = "questionCloudTagTheme"; protected static final String DOCUMENT_TYPE = "questionindextype"; - public static final FieldType TYPE_STORED = new FieldType(); - 
static { - TYPE_STORED.setOmitNorms(true); - TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - TYPE_STORED.setStored(true); - TYPE_STORED.setStoreTermVectors(true); - TYPE_STORED.setStoreTermVectorPositions(true); - TYPE_STORED.setStoreTermVectorOffsets(true); - TYPE_STORED.setStoreTermVectorPayloads(true); - TYPE_STORED.setTokenized(true); - TYPE_STORED.freeze(); - } - public void indexQuestion(QuestionBean question) throws IOException { // First : try to find if already exist to update it @@ -120,11 +107,11 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new StringField(QUESTION_ID_INDEX_PROPERTY, question.getId(), Field.Store.YES)); doc.add(new TextField(QUESTION_TITLE_INDEX_PROPERTY, questionTitle, Field.Store.YES)); - doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, TYPE_STORED)); + doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, LuceneUtils.TYPE_STORED)); // Cloud Tag management if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), TYPE_STORED)); + doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } // if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); @@ -137,7 +124,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TYPE_STORED)); + doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } } @@ -157,14 +144,14 @@ public class QuestionsIndexationService extends 
CoselmarSimpleServiceSupport { doc.add(new StringField(QUESTION_ID_INDEX_PROPERTY, question.getId(), Field.Store.YES)); doc.add(new TextField(QUESTION_TITLE_INDEX_PROPERTY, questionTitle, Field.Store.YES)); - doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, TYPE_STORED)); + doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, LuceneUtils.TYPE_STORED)); doc.add(new TextField(QUESTION_STATUS_INDEX_PROPERTY, question.getStatus(), Field.Store.YES)); doc.add(new TextField(QUESTION_PRIVACY_INDEX_PROPERTY, question.getPrivacy(), Field.Store.YES)); // Cloud Tag management if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), TYPE_STORED)); + doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } // if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); @@ -177,7 +164,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TYPE_STORED)); + doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } } -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.

Reply

Sign in to reply online Use email software

codelutin.com scm

9:43 a.m.

New subject: 05/05: Merge branch 'feature/9197-Indexation_documents' into develop

This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit c4e9b3e7d51a79043dfb935091ba4d0f6550dbc1 Merge: b25dc5e 1e43d56 Author: Yannick Martel <martel@codelutin.com> Date: Wed May 24 11:04:54 2017 +0200 Merge branch 'feature/9197-Indexation_documents' into develop coselmar-rest/pom.xml | 8 +- .../indexation/DocumentsIndexationService.java | 215 +++++++++++++++------ .../coselmar/services/indexation/LuceneUtils.java | 29 ++- .../indexation/QuestionsIndexationService.java | 76 ++++---- .../indexation/TransverseIndexationService.java | 6 +- .../coselmar/services/v1/AdminWebService.java | 2 +- .../coselmar/services/v1/DocumentsWebService.java | 38 ++-- .../coselmar/services/v1/QuestionsWebService.java | 85 +++++--- .../indexation/DocumentsIndexationServiceTest.java | 16 +- .../indexation/QuestionsIndexationServiceTest.java | 22 +-- pom.xml | 15 +- 11 files changed, 354 insertions(+), 158 deletions(-) -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.

Reply

Sign in to reply online Use email software

codelutin.com scm

codelutin.com scm

codelutin.com scm

codelutin.com scm

codelutin.com scm

codelutin.com scm

tags

participants (1)