branch develop updated (b25dc5e -> c4e9b3e)
This is an automated email from the git hooks/post-receive script. New change to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git from b25dc5e [jgitflow-maven-plugin]Updating develop poms back to pre merge state new 8517e8f refs #9197 Utilisation de Apache Tika pour indexer les documents new f90edbe Upgrade lucene + revue de l'indexation des données de Question new c616ba1 refs #9197 use vectors on document fields indexation new 1e43d56 refs #9197 use lucene to make cloud tag on question page : data from question and its documents including file content new c4e9b3e Merge branch 'feature/9197-Indexation_documents' into develop The 5 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "adds" were already present in the repository and have only been added to this reference. Detailed log of new commits: commit c4e9b3e7d51a79043dfb935091ba4d0f6550dbc1 Merge: b25dc5e 1e43d56 Author: Yannick Martel <martel@codelutin.com> Date: Wed May 24 11:04:54 2017 +0200 Merge branch 'feature/9197-Indexation_documents' into develop commit 1e43d560da4e23e5d45163f59ae9f9573e936ab5 Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 18:05:10 2017 +0200 refs #9197 use lucene to make cloud tag on question page : data from question and its documents including file content commit c616ba1fb803d57b19b2c1dd0c5f1e0e62108b2b Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 16:08:11 2017 +0200 refs #9197 use vectors on document fields indexation commit f90edbef1de138249656cf0f778863fe2ffeb654 Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 11:16:32 2017 +0200 Upgrade lucene + revue de l'indexation des données de Question commit 8517e8f45186e4080ce56c7eb489d9592e2a4802 Author: Yannick Martel <martel@codelutin.com> Date: Mon May 15 22:01:04 2017 +0200 refs #9197 Utilisation de Apache Tika pour indexer les documents Summary of changes: 
coselmar-rest/pom.xml | 8 +- .../indexation/DocumentsIndexationService.java | 215 +++++++++++++++------ .../coselmar/services/indexation/LuceneUtils.java | 29 ++- .../indexation/QuestionsIndexationService.java | 76 ++++---- .../indexation/TransverseIndexationService.java | 6 +- .../coselmar/services/v1/AdminWebService.java | 2 +- .../coselmar/services/v1/DocumentsWebService.java | 38 ++-- .../coselmar/services/v1/QuestionsWebService.java | 85 +++++--- .../indexation/DocumentsIndexationServiceTest.java | 16 +- .../indexation/QuestionsIndexationServiceTest.java | 22 +-- pom.xml | 15 +- 11 files changed, 354 insertions(+), 158 deletions(-) -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit 8517e8f45186e4080ce56c7eb489d9592e2a4802 Author: Yannick Martel <martel@©odelutin.com> Date: Mon May 15 22:01:04 2017 +0200 refs #9197 Utilisation de Apache Tika pour indexer les documents --- coselmar-rest/pom.xml | 8 +- .../indexation/DocumentsIndexationService.java | 127 ++++++++++++++------- .../coselmar/services/indexation/LuceneUtils.java | 14 ++- .../coselmar/services/v1/AdminWebService.java | 2 +- .../coselmar/services/v1/DocumentsWebService.java | 38 ++++-- .../indexation/DocumentsIndexationServiceTest.java | 16 +-- pom.xml | 13 ++- 7 files changed, 146 insertions(+), 72 deletions(-) diff --git a/coselmar-rest/pom.xml b/coselmar-rest/pom.xml index bcf9b30..76b67c6 100644 --- a/coselmar-rest/pom.xml +++ b/coselmar-rest/pom.xml @@ -138,7 +138,7 @@ </dependency> <dependency> - <groupId>postgresql</groupId> + <groupId>org.postgresql</groupId> <artifactId>postgresql</artifactId> <scope>runtime</scope> </dependency> @@ -165,6 +165,12 @@ <artifactId>lucene-backward-codecs</artifactId> </dependency> + <!-- Tika for document indexation --> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + </dependency> + <!-- Others --> <dependency> <groupId>com.github.spullara.mustache.java</groupId> diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index 16bca16..92402fb 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -24,6 +24,7 @@ package fr.ifremer.coselmar.services.indexation; * #L% */ +import java.io.File; import 
java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -32,6 +33,8 @@ import java.util.Set; import fr.ifremer.coselmar.beans.DocumentBean; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; @@ -45,6 +48,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; +import org.apache.tika.exception.TikaException; /** * This Services provides operation about {@link fr.ifremer.coselmar.persistence.entity.Document} @@ -62,6 +66,8 @@ import org.apache.lucene.search.WildcardQuery; */ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { + private static final Log log = LogFactory.getLog(DocumentsIndexationService.class); + protected static final String DOCUMENT_ID_INDEX_PROPERTY = "documentId"; protected static final String DOCUMENT_NAME_INDEX_PROPERTY = "documentName"; protected static final String DOCUMENT_AUTHORS_INDEX_PROPERTY = "documentAuthors"; @@ -70,9 +76,10 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { protected static final String DOCUMENT_NAME_CLOUD_TAG_PROPERTY = "documentCloudTagName"; protected static final String DOCUMENT_SUMMARY_CLOUD_TAG_PROPERTY = "documentCloudTagSummary"; protected static final String DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY = "documentCloudTagKeyword"; + protected static final String DOCUMENT_FILE_CONTENT_INDEX_PROPERTY = "documentFileContent"; protected static final String DOCUMENT_TYPE = "documentindextype"; - public void indexDocument(DocumentBean document) throws IOException { + public void indexDocument(DocumentBean document, String filepath) throws 
IOException { Document doc = new Document(); doc.add(new StringField(DOCUMENT_ID_INDEX_PROPERTY, document.getId(), Field.Store.YES)); @@ -107,6 +114,20 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } } + if (StringUtils.isNotBlank(filepath)) { + try { + File documentFile = new File(filepath); + String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + } catch (TikaException te) { + if (log.isErrorEnabled()) { + String message = String.format("Unable to index document '%s'", filepath); + log.error(message); + } + } + + } + getLuceneUtils().getIndexWriter().addDocument(doc); getLuceneUtils().getIndexWriter().commit(); @@ -119,32 +140,35 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { String[] words = text.split(" "); // Parse a simple query that searches for the "text": - BooleanQuery query = new BooleanQuery(); + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); - BooleanQuery nameQuery = new BooleanQuery(); - BooleanQuery summaryQuery = new BooleanQuery(); - BooleanQuery authorsQuery = new BooleanQuery(); + BooleanQuery.Builder nameQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder summaryQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder authorsQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder documentFileQueryBuilder = new BooleanQuery.Builder(); for (String word : words) { String wildWord = String.format("*%s*", word.toLowerCase()); - nameQuery.add(new WildcardQuery(new Term(DOCUMENT_NAME_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); - summaryQuery.add(new WildcardQuery(new Term(DOCUMENT_SUMMARY_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); - authorsQuery.add(new WildcardQuery(new Term(DOCUMENT_AUTHORS_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + nameQueryBuilder.add(new 
WildcardQuery(new Term(DOCUMENT_NAME_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + summaryQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_SUMMARY_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + authorsQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_AUTHORS_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + documentFileQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); } - query.add(nameQuery, BooleanClause.Occur.SHOULD); - query.add(summaryQuery, BooleanClause.Occur.SHOULD); - query.add(authorsQuery, BooleanClause.Occur.SHOULD); + queryBuilder.add(nameQueryBuilder.build(), BooleanClause.Occur.SHOULD); + queryBuilder.add(summaryQueryBuilder.build(), BooleanClause.Occur.SHOULD); + queryBuilder.add(authorsQueryBuilder.build(), BooleanClause.Occur.SHOULD); + queryBuilder.add(documentFileQueryBuilder.build(), BooleanClause.Occur.SHOULD); - query.add(new TermQuery(new Term(DOCUMENT_KEYWORD_INDEX_PROPERTY, text.toLowerCase())), BooleanClause.Occur.SHOULD); + queryBuilder.add(new TermQuery(new Term(DOCUMENT_KEYWORD_INDEX_PROPERTY, text.toLowerCase())), BooleanClause.Occur.SHOULD); // Combine that with the type - BooleanQuery fullQuery = new BooleanQuery(); - fullQuery.add(query, BooleanClause.Occur.MUST); - fullQuery.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery.Builder fullQueryBuilder = new BooleanQuery.Builder(); + fullQueryBuilder.add(queryBuilder.build(), BooleanClause.Occur.MUST); + fullQueryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); - ScoreDoc[] hits = isearcher.search(fullQuery, null, 1000).scoreDocs; + ScoreDoc[] hits = isearcher.search(fullQueryBuilder.build(), 1000).scoreDocs; List<String> documentIds = new ArrayList(hits.length); @@ -163,42 +187,45 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { IndexSearcher isearcher = new 
IndexSearcher(ireader); - BooleanQuery keywordsQuery = new BooleanQuery(); + BooleanQuery.Builder keywordsQueryBuilder = new BooleanQuery.Builder(); for (String text : texts) { String[] words = text.split(" "); // Parse a simple query that searches for the "text": - BooleanQuery query = new BooleanQuery(); + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); - BooleanQuery nameQuery = new BooleanQuery(); - BooleanQuery summaryQuery = new BooleanQuery(); - BooleanQuery authorsQuery = new BooleanQuery(); + BooleanQuery.Builder nameQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder summaryQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder authorsQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder documentFileQueryBuilder = new BooleanQuery.Builder(); for (String word : words) { String wildWord = "*" + word.toLowerCase() + "*"; - nameQuery.add(new WildcardQuery(new Term(DOCUMENT_NAME_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); - summaryQuery.add(new WildcardQuery(new Term(DOCUMENT_SUMMARY_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); - authorsQuery.add(new WildcardQuery(new Term(DOCUMENT_AUTHORS_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + nameQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_NAME_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + summaryQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_SUMMARY_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + authorsQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_AUTHORS_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); + documentFileQueryBuilder.add(new WildcardQuery(new Term(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, wildWord)), BooleanClause.Occur.MUST); } - query.add(nameQuery, BooleanClause.Occur.SHOULD); - query.add(summaryQuery, BooleanClause.Occur.SHOULD); - query.add(authorsQuery, BooleanClause.Occur.SHOULD); + queryBuilder.add(nameQueryBuilder.build(), BooleanClause.Occur.SHOULD); + 
queryBuilder.add(summaryQueryBuilder.build(), BooleanClause.Occur.SHOULD); + queryBuilder.add(authorsQueryBuilder.build(), BooleanClause.Occur.SHOULD); + queryBuilder.add(documentFileQueryBuilder.build(), BooleanClause.Occur.SHOULD); - query.add(new TermQuery(new Term(DOCUMENT_KEYWORD_INDEX_PROPERTY, text.toLowerCase())), BooleanClause.Occur.SHOULD); + queryBuilder.add(new TermQuery(new Term(DOCUMENT_KEYWORD_INDEX_PROPERTY, text.toLowerCase())), BooleanClause.Occur.SHOULD); // Combine that with the type //XXX ymartel : put to Occur.SHOULD to make an "OR" - keywordsQuery.add(query, BooleanClause.Occur.MUST); + keywordsQueryBuilder.add(queryBuilder.build(), BooleanClause.Occur.MUST); } - BooleanQuery fullQuery = new BooleanQuery(); - fullQuery.add(keywordsQuery, BooleanClause.Occur.MUST); - fullQuery.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery.Builder fullQueryBuilder = new BooleanQuery.Builder(); + fullQueryBuilder.add(keywordsQueryBuilder.build(), BooleanClause.Occur.MUST); + fullQueryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); - ScoreDoc[] hits = isearcher.search(fullQuery, null, 1000).scoreDocs; + ScoreDoc[] hits = isearcher.search(fullQueryBuilder.build(), 1000).scoreDocs; List<String> documentIds = new ArrayList(hits.length); @@ -212,16 +239,16 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { return documentIds; } - public void updateDocument(DocumentBean document) throws IOException { + public void updateDocument(DocumentBean document, String filepath) throws IOException { DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); IndexSearcher isearcher = new IndexSearcher(ireader); // Retrieve document - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new Term(DOCUMENT_ID_INDEX_PROPERTY, document.getId())), BooleanClause.Occur.MUST); - query.add(new TermQuery(new Term("type", 
DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + queryBuilder.add(new TermQuery(new Term(DOCUMENT_ID_INDEX_PROPERTY, document.getId())), BooleanClause.Occur.MUST); + queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); - ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; + ScoreDoc[] hits = isearcher.search(queryBuilder.build(), 1000).scoreDocs; if (hits.length > 0) { Document doc = new Document(); doc.add(new StringField(DOCUMENT_ID_INDEX_PROPERTY, document.getId(), Field.Store.YES)); @@ -249,13 +276,27 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (keyword.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), TextField.TYPE_STORED)); + doc.add(new TextField(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), Field.Store.YES)); } } } doc.add(new Field("type", DOCUMENT_TYPE, TextField.TYPE_STORED)); + if (StringUtils.isNotBlank(filepath)) { + try { + File documentFile = new File(filepath); + String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + } catch (TikaException te) { + if (log.isErrorEnabled()) { + String message = String.format("Unable to index document '%s'", filepath); + log.error(message); + } + } + + } + getLuceneUtils().getIndexWriter().updateDocument(new Term(DOCUMENT_ID_INDEX_PROPERTY, document.getId()), doc); getLuceneUtils().getIndexWriter().commit(); } @@ -266,11 +307,11 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { public void deleteDocument(String documentId) throws IOException { // Retrieve document - BooleanQuery query = new BooleanQuery(); - query.add(new TermQuery(new 
Term(DOCUMENT_ID_INDEX_PROPERTY, documentId)), BooleanClause.Occur.MUST); - query.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + queryBuilder.add(new TermQuery(new Term(DOCUMENT_ID_INDEX_PROPERTY, documentId)), BooleanClause.Occur.MUST); + queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); - getLuceneUtils().getIndexWriter().deleteDocuments(query); + getLuceneUtils().getIndexWriter().deleteDocuments(queryBuilder.build()); getLuceneUtils().getIndexWriter().commit(); } diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java index b6c0736..43a3c43 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java @@ -26,22 +26,18 @@ package fr.ifremer.coselmar.services.indexation; import java.io.File; import java.io.IOException; -import java.io.InputStreamReader; import fr.ifremer.coselmar.config.CoselmarServicesConfig; -import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.core.SimpleAnalyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; -import org.apache.lucene.analysis.fr.FrenchAnalyzer; -import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.NIOFSDirectory; +import org.apache.tika.Tika; /** * @author ymartel <martel@codelutin.com> @@ -53,6 +49,7 @@ public class LuceneUtils { public Analyzer 
analyzer; public final IndexWriterConfig indexationConfig = new IndexWriterConfig(getAnalyzer()); public IndexWriter indexWriter; + protected Tika tika; protected CoselmarServicesConfig servicesConfig; @@ -81,6 +78,13 @@ public class LuceneUtils { return indexWriter; } + public Tika getTika() { + if (tika == null) { + this.tika = new Tika(); + } + return tika; + } + public void closeWriter() { if (indexWriter != null) { try { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java index 675b49a..33b3e57 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java @@ -78,7 +78,7 @@ public class AdminWebService extends CoselmarWebServiceSupport { for (Document document : documents) { String lightId = getPersistenceContext().getTopiaIdFactory().getRandomPart(document.getTopiaId()); DocumentBean documentBean = BeanEntityConverter.toBean(getPersistenceContext().getTopiaIdFactory(), document); - documentsIndexationService.indexDocument(documentBean); + documentsIndexationService.indexDocument(documentBean, document.getFilePath()); } // Get all questions diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java index 61cf842..9a1d010 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java @@ -445,7 +445,7 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); try { - documentsIndexationService.indexDocument(result); + 
documentsIndexationService.indexDocument(result, filePath); if (log.isDebugEnabled()) { String message = String.format("Document '%s' added to index", documentName); log.debug(message); @@ -471,11 +471,11 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { CoselmarUser currentUser = checkUserAuthentication(authorization); String documentFullId = getFullIdFromShort(Document.class, documentId); - Document document = getDocumentDao().forTopiaIdEquals(documentFullId).findAny(); + Document documentEntity = getDocumentDao().forTopiaIdEquals(documentFullId).findAny(); // Only Owner Expert or Supervisor/Admin can add document file if (!DOCUMENT_SUPER_USER_ROLES.contains(currentUser.getRole().name()) - && document.getOwner() != currentUser) { + && documentEntity.getOwner() != currentUser) { String message = String.format("User %s %s ('%s') is not allowed to add document file", currentUser.getFirstname(), currentUser.getName(), getShortIdFromFull(currentUser.getTopiaId())); if (log.isWarnEnabled()) { @@ -485,21 +485,37 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { } // Get owner to place correctly the file - CoselmarUser owner = document.getOwner(); + CoselmarUser owner = documentEntity.getOwner(); Pair<String, String> pathAndContentType = managerDocumentFile(uploadFile, owner); String filePath = pathAndContentType.getLeft(); String contentType = pathAndContentType.getRight(); // If document has already a file, remove it - if (StringUtils.isNotBlank(document.getFilePath())) { - File documentFile = new File(document.getFilePath()); + if (StringUtils.isNotBlank(documentEntity.getFilePath())) { + File documentFile = new File(documentEntity.getFilePath()); FileUtils.deleteQuietly(documentFile); } - document.setWithFile(true); - document.setMimeType(contentType); - document.setFilePath(filePath); - document.setFileName(uploadFile.getName()); + documentEntity.setWithFile(true); + documentEntity.setMimeType(contentType); + 
documentEntity.setFilePath(filePath); + documentEntity.setFileName(uploadFile.getName()); + + // Should update document index information to put the file + DocumentBean documentBean = BeanEntityConverter.toBean(getPersistenceContext().getTopiaIdFactory(), documentEntity); + + DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); + try { + documentsIndexationService.updateDocument(documentBean, filePath); // no document file for the moment here + if (log.isDebugEnabled()) { + String message = String.format("Document '%s' was updated in index", documentEntity.getName()); + log.debug(message); + } + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error("Unable to update document index information", e); + } + } commit(); @@ -654,7 +670,7 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); try { - documentsIndexationService.indexDocument(result); + documentsIndexationService.indexDocument(result, null); // no document file for the moment here if (log.isDebugEnabled()) { String message = String.format("Document '%s' was updated in index", document.getName()); log.debug(message); diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationServiceTest.java index 9f74687..acd53aa 100644 --- a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationServiceTest.java +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationServiceTest.java @@ -67,7 +67,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is not a fake document used for test", "fr", null, "Jack, Jane", null, null, false, null, 
"http://somewhere", "no comment", null, null); - documentsIndexationService.indexDocument(documentOne); + documentsIndexationService.indexDocument(documentOne, null); } @@ -124,7 +124,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is not a fake document used for test", "fr", null, "Jack, Jane", null, null, false, null, "http://somewhere", "no comment", null, null); - documentsIndexationService.indexDocument(documentOne); + documentsIndexationService.indexDocument(documentOne, null); List<String> documentMatchingDocumentIds = documentsIndexationService.searchDocuments("document"); Assert.assertEquals(1, documentMatchingDocumentIds.size()); @@ -149,7 +149,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is a faked doct updated for test", "fr", null, "James, JJ", null, null, false, null, "http://somewhere", "no comment", null, null); - documentsIndexationService.updateDocument(documentOne); + documentsIndexationService.updateDocument(documentOne, null); documentMatchingDocumentIds = documentsIndexationService.searchDocuments("document"); Assert.assertTrue(documentMatchingDocumentIds.isEmpty()); @@ -222,7 +222,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is not a fake document used for test", "fr", null, "Jack, Jane", null, null, false, null, "http://somewhere", "no comment", null, null); - documentsIndexationService.indexDocument(documentOne); + documentsIndexationService.indexDocument(documentOne, null); String documentTwoId = "testSearchMultiple_document2"; @@ -231,7 +231,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest new Date(), Lists.newArrayList("tardis", "documentation", "old", "new", "borrowed", "blue"), "testDocument", "This is part of document about the TARDIS", "fr", null, "The Doctor, Rose, Amy, River, Clara", null, null, false, null, "http://tardis.wikia.com/wiki/TARDIS", "no comment", 
null, null); - documentsIndexationService.indexDocument(documentTwo); + documentsIndexationService.indexDocument(documentTwo, null); List<String> documentMatchingDoctorIds = documentsIndexationService.searchDocuments(Arrays.asList("doctor")); @@ -266,7 +266,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is not a fake document used for test", "fr", null, "Jack, Jane", null, null, false, null, "http://somewhere", "no comment", null, null); - documentsIndexationService.indexDocument(documentOne); + documentsIndexationService.indexDocument(documentOne, null); DocumentBean documentTwo = new DocumentBean("document2", "Another document", "Amy Pond", "user002", Privacy.PUBLIC.name(), @@ -274,7 +274,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is just an other document used for test", "fr", null, "Amy, Rory", null, null, false, null, "http://somewhere", "no comment", null, null); - documentsIndexationService.indexDocument(documentTwo); + documentsIndexationService.indexDocument(documentTwo, null); DocumentBean documentThree = new DocumentBean("document3", "Tardis documentation", "The Doctor", "user003", Privacy.PUBLIC.name(), @@ -282,7 +282,7 @@ public class DocumentsIndexationServiceTest extends AbstractCoselmarServiceTest "This is part of documentation about the TARDIS", "fr", null, "The Doctor, Rose, Amy, River, Clara", null, null, false, null, "http://tardis.wikia.com/wiki/TARDIS", "no comment", null, null); - documentsIndexationService.indexDocument(documentThree); + documentsIndexationService.indexDocument(documentThree, null); } diff --git a/pom.xml b/pom.xml index 2c1ad62..f4bd15e 100644 --- a/pom.xml +++ b/pom.xml @@ -134,11 +134,12 @@ <nuitonValidatorVersion>3.0</nuitonValidatorVersion> <nuitonConvertorVersion>1.0</nuitonConvertorVersion> - <hibernateVersion>4.3.8.Final</hibernateVersion> - <postgresqlVersion>9.1-901-1.jdbc4</postgresqlVersion> + 
<hibernateVersion>4.3.11.Final</hibernateVersion> + <postgresqlVersion>9.4.1212.jre7</postgresqlVersion> <h2Version>1.4.190</h2Version> <luceneVersion>5.4.0</luceneVersion> + <tikaVersion>1.14</tikaVersion> <tomcatEmbedVersion>7.0.50</tomcatEmbedVersion> @@ -279,7 +280,7 @@ </dependency> <dependency> - <groupId>postgresql</groupId> + <groupId>org.postgresql</groupId> <artifactId>postgresql</artifactId> <version>${postgresqlVersion}</version> </dependency> @@ -328,6 +329,12 @@ <scope>runtime</scope> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + <version>${tikaVersion}</version> + </dependency> + <!-- Commons --> <dependency> <groupId>org.apache.commons</groupId> -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit f90edbef1de138249656cf0f778863fe2ffeb654 Author: Yannick Martel <martel@©odelutin.com> Date: Mon May 22 11:16:32 2017 +0200 Upgrade lucene + revue de l'indexation des données de Question --- .../indexation/DocumentsIndexationService.java | 22 +++--- .../indexation/QuestionsIndexationService.java | 88 +++++++++++++--------- .../indexation/TransverseIndexationService.java | 6 +- .../indexation/QuestionsIndexationServiceTest.java | 22 +++--- pom.xml | 2 +- 5 files changed, 81 insertions(+), 59 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index 92402fb..87c4b68 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -24,12 +24,6 @@ package fr.ifremer.coselmar.services.indexation; * #L% */ -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; - import fr.ifremer.coselmar.beans.DocumentBean; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import org.apache.commons.lang3.StringUtils; @@ -50,6 +44,12 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.tika.exception.TikaException; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + /** * This Services provides operation about {@link fr.ifremer.coselmar.persistence.entity.Document} * or more exactly {@link fr.ifremer.coselmar.beans.DocumentBean} indexation : @@ -134,7 +134,7 @@ 
public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } public List<String> searchDocuments(String text) throws IOException, ParseException { - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); String[] words = text.split(" "); @@ -183,7 +183,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } public List<String> searchDocuments(List<String> texts) throws IOException, ParseException { - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); @@ -240,7 +240,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } public void updateDocument(DocumentBean document, String filepath) throws IOException { - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); // Retrieve document @@ -287,7 +287,9 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { try { File documentFile = new File(filepath); String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + if (StringUtils.isNotBlank(parsedDocumentFile)) { + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + } } catch (TikaException te) { if (log.isErrorEnabled()) { String message = String.format("Unable to index document '%s'", filepath); diff --git 
a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index 2c97773..b5e92d3 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -26,15 +26,19 @@ package fr.ifremer.coselmar.services.indexation; import fr.ifremer.coselmar.beans.QuestionBean; import fr.ifremer.coselmar.beans.QuestionSearchBean; -import fr.ifremer.coselmar.beans.QuestionSearchExample; import fr.ifremer.coselmar.services.CoselmarSimpleServiceSupport; import org.apache.commons.lang3.StringUtils; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.misc.HighFreqTerms; import org.apache.lucene.misc.HighFreqTermsMultiFields; import org.apache.lucene.misc.TermStats; @@ -46,6 +50,7 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; import java.io.IOException; import java.util.ArrayList; @@ -81,10 +86,23 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { protected static final String QUESTION_THEME_CLOUD_TAG_PROPERTY = "questionCloudTagTheme"; protected static final String DOCUMENT_TYPE = "questionindextype"; + public static final FieldType TYPE_STORED = new 
FieldType(); + static { + TYPE_STORED.setOmitNorms(true); + TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + TYPE_STORED.setStored(true); + TYPE_STORED.setStoreTermVectors(true); + TYPE_STORED.setStoreTermVectorPositions(true); + TYPE_STORED.setStoreTermVectorOffsets(true); + TYPE_STORED.setStoreTermVectorPayloads(true); + TYPE_STORED.setTokenized(true); + TYPE_STORED.freeze(); + } + public void indexQuestion(QuestionBean question) throws IOException { // First : try to find if already exist to update it - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); // Retrieve document @@ -102,11 +120,11 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new StringField(QUESTION_ID_INDEX_PROPERTY, question.getId(), Field.Store.YES)); doc.add(new TextField(QUESTION_TITLE_INDEX_PROPERTY, questionTitle, Field.Store.YES)); - doc.add(new TextField(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, Field.Store.YES)); + doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, TYPE_STORED)); // Cloud Tag management if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new TextField(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), TYPE_STORED)); } // if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); @@ -119,7 +137,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new 
Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TextField.TYPE_STORED)); + doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TYPE_STORED)); } } } @@ -139,14 +157,14 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new StringField(QUESTION_ID_INDEX_PROPERTY, question.getId(), Field.Store.YES)); doc.add(new TextField(QUESTION_TITLE_INDEX_PROPERTY, questionTitle, Field.Store.YES)); - doc.add(new TextField(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, Field.Store.YES)); + doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, TYPE_STORED)); doc.add(new TextField(QUESTION_STATUS_INDEX_PROPERTY, question.getStatus(), Field.Store.YES)); doc.add(new TextField(QUESTION_PRIVACY_INDEX_PROPERTY, question.getPrivacy(), Field.Store.YES)); // Cloud Tag management if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new TextField(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), TYPE_STORED)); } // if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); @@ -159,7 +177,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TextField.TYPE_STORED)); + doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TYPE_STORED)); } } } @@ -177,7 +195,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { } public List<String> searchQuestion(QuestionSearchBean searchBean) throws IOException, ParseException { - DirectoryReader 
ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); // Combine that with the type @@ -272,7 +290,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { public Map<String, Long> getTopTerms() throws IOException, ParseException { - DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); Map<String, Long> result = new LinkedHashMap<>(); try { @@ -297,46 +315,48 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { return result; } - public Map<String, Long> getTopDocumentsTerms(List<String> questionIds) throws IOException, ParseException { + public Map<String, Long> getTopQuestionsTerms(List<String> questionIds) throws IOException, ParseException { - DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); // Combine that with the type BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + BooleanQuery.Builder questionIdBuilder = new BooleanQuery.Builder(); for (String questionId : questionIds) { if(StringUtils.isNotBlank(questionId)) { - queryBuilder.add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, questionId.toLowerCase())), BooleanClause.Occur.SHOULD); + questionIdBuilder.add(new TermQuery(new Term(QUESTION_ID_INDEX_PROPERTY, questionId.toLowerCase())), BooleanClause.Occur.SHOULD); } } + queryBuilder.add(questionIdBuilder.build(), BooleanClause.Occur.MUST); TopDocs hits = isearcher.search(queryBuilder.build(), 100); ScoreDoc[] scoreDocs = 
hits.scoreDocs; - System.out.println("hits=" + scoreDocs.length); - System.out.println("Hits (rank,score,docId)"); - for (int n = 0; n < scoreDocs.length; ++n) { - ScoreDoc sd = scoreDocs[n]; - float score = sd.score; - int docId = sd.doc; - } - -// TopFieldCollector topFieldCollector = TopFieldCollector.create(new Sort(), 100, true, true, false); -// isearcher.search(queryBuilder.build(), topFieldCollector); Map<String, Long> result = new LinkedHashMap<>(); -// TopFieldDocs topField = topFieldCollector.topDocs(); -// for (SortField sortField : topField.fields) { -// String field = sortField.getField(); -// long sumDocFreq = ireader.getSumDocFreq(field); -// -// if (result.containsKey(field)) { -// result.put(field, result.get(field) + sumDocFreq); -// } else { -// result.put(field, sumDocFreq); -// } -// } + + for (ScoreDoc scoreDoc : scoreDocs) { + Fields termVectors = ireader.getTermVectors(scoreDoc.doc); + for (String termVector : termVectors) { + System.out.println("Vector: " + termVector); + Terms vector = ireader.getTermVector(scoreDoc.doc, termVector); + TermsEnum termsEnum = vector.iterator(); + BytesRef bytesRef = termsEnum.next(); + while(bytesRef != null){ + String term = bytesRef.utf8ToString(); + long totalTermFreq = termsEnum.totalTermFreq(); + + if (result.containsKey(term)) { + result.put(term, result.get(term) + totalTermFreq); + } else { + result.put(term, totalTermFreq); + } + bytesRef = termsEnum.next(); + } + } + } ireader.close(); return result; diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java index db6ae9b..2eceac3 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TransverseIndexationService.java @@ -38,7 +38,6 @@ import 
org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.TermQuery; import java.io.IOException; -import java.util.Comparator; import java.util.LinkedHashMap; import java.util.Map; import java.util.TreeMap; @@ -73,7 +72,7 @@ public class TransverseIndexationService extends CoselmarSimpleServiceSupport { public Map<String, Long> getTopTerms() throws IOException, ParseException { - DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter(), false); + DirectoryReader indexReader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); Map<String, Long> topWords = new LinkedHashMap<>(); try { @@ -81,7 +80,8 @@ public class TransverseIndexationService extends CoselmarSimpleServiceSupport { QuestionsIndexationService.QUESTION_TITLE_CLOUD_TAG_PROPERTY, QuestionsIndexationService.QUESTION_THEME_CLOUD_TAG_PROPERTY, DocumentsIndexationService.DOCUMENT_NAME_CLOUD_TAG_PROPERTY, - DocumentsIndexationService.DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY + DocumentsIndexationService.DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, + DocumentsIndexationService.DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, }; TermStats[] highFreqTerms = HighFreqTermsMultiFields.getHighFreqTermsMultiFields(indexReader, 40, searchedFields, new HighFreqTerms.TotalTermFreqComparator()); diff --git a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java index df99838..2d2d154 100644 --- a/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java +++ b/coselmar-rest/src/test/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationServiceTest.java @@ -24,12 +24,6 @@ package fr.ifremer.coselmar.services.indexation; * #L% */ -import java.util.Arrays; -import java.util.Date; -import java.util.List; -import java.util.Locale; -import java.util.Map; - import com.google.common.collect.Sets; 
import fr.ifremer.coselmar.beans.QuestionBean; import fr.ifremer.coselmar.beans.QuestionSearchBean; @@ -43,6 +37,12 @@ import org.junit.Before; import org.junit.Test; import org.nuiton.util.DateUtil; +import java.util.Arrays; +import java.util.Date; +import java.util.List; +import java.util.Locale; +import java.util.Map; + /** * @author ymartel <martel@codelutin.com> */ @@ -463,12 +463,12 @@ public class QuestionsIndexationServiceTest extends AbstractCoselmarServiceTest questionsIndexationService.indexQuestion(questionTwo); // Ok, let's search now ! - Map<String, Long> topTerms = questionsIndexationService.getTopDocumentsTerms(Arrays.asList(questionOneId)); + Map<String, Long> topTerms = questionsIndexationService.getTopQuestionsTerms(Arrays.asList(questionOneId)); Assert.assertNotNull(topTerms); -// Assert.assertEquals(1, topTerms.get("question").longValue()); -// Assert.assertEquals(2, topTerms.get("tardis").longValue()); -// Assert.assertEquals(2, topTerms.get("time").longValue()); -// Assert.assertEquals(1, topTerms.get("space").longValue()); + Assert.assertEquals(1, topTerms.get("question").longValue()); + Assert.assertEquals(2, topTerms.get("tardis").longValue()); + Assert.assertEquals(2, topTerms.get("time").longValue()); + Assert.assertEquals(1, topTerms.get("space").longValue()); } } diff --git a/pom.xml b/pom.xml index f4bd15e..20fb3bd 100644 --- a/pom.xml +++ b/pom.xml @@ -138,7 +138,7 @@ <postgresqlVersion>9.4.1212.jre7</postgresqlVersion> <h2Version>1.4.190</h2Version> - <luceneVersion>5.4.0</luceneVersion> + <luceneVersion>6.5.1</luceneVersion> <tikaVersion>1.14</tikaVersion> <tomcatEmbedVersion>7.0.50</tomcatEmbedVersion> -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit c616ba1fb803d57b19b2c1dd0c5f1e0e62108b2b Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 16:08:11 2017 +0200 refs #9197 use vectors on document fields indexation --- .../indexation/DocumentsIndexationService.java | 20 ++++++++--------- .../coselmar/services/indexation/LuceneUtils.java | 15 +++++++++++++ .../indexation/QuestionsIndexationService.java | 25 ++++++---------------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index 87c4b68..e218577 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -87,16 +87,16 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { String documentName = document.getName(); String documentSummary = document.getSummary(); - doc.add(new TextField(DOCUMENT_NAME_INDEX_PROPERTY, documentName, Field.Store.YES)); + doc.add(new Field(DOCUMENT_NAME_INDEX_PROPERTY, documentName, LuceneUtils.TYPE_STORED)); if (StringUtils.isNotBlank(document.getAuthors())) { doc.add(new TextField(DOCUMENT_AUTHORS_INDEX_PROPERTY, document.getAuthors(), Field.Store.YES)); } - doc.add(new TextField(DOCUMENT_SUMMARY_INDEX_PROPERTY, documentSummary, Field.Store.YES)); + doc.add(new Field(DOCUMENT_SUMMARY_INDEX_PROPERTY, documentSummary, LuceneUtils.TYPE_STORED)); doc.add(new Field("type", DOCUMENT_TYPE, TextField.TYPE_STORED)); // Cloud Tag management if (documentName.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new 
TextField(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } // if (documentSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(DOCUMENT_SUMMARY_CLOUD_TAG_PROPERTY, documentSummary, Field.Store.YES)); @@ -109,7 +109,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (keyword.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), TextField.TYPE_STORED)); + doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } } @@ -118,7 +118,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { try { File documentFile = new File(filepath); String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, LuceneUtils.TYPE_STORED)); } catch (TikaException te) { if (log.isErrorEnabled()) { String message = String.format("Unable to index document '%s'", filepath); @@ -255,15 +255,15 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { String documentName = document.getName(); String documentSummary = document.getSummary(); - doc.add(new TextField(DOCUMENT_NAME_INDEX_PROPERTY, documentName, Field.Store.YES)); + doc.add(new Field(DOCUMENT_NAME_INDEX_PROPERTY, documentName, LuceneUtils.TYPE_STORED)); if (StringUtils.isNotBlank(document.getAuthors())) { doc.add(new TextField(DOCUMENT_AUTHORS_INDEX_PROPERTY, document.getAuthors(), Field.Store.YES)); } - doc.add(new TextField(DOCUMENT_SUMMARY_INDEX_PROPERTY, 
documentSummary, Field.Store.YES)); + doc.add(new Field(DOCUMENT_SUMMARY_INDEX_PROPERTY, documentSummary, LuceneUtils.TYPE_STORED)); // Cloud Tag management if (documentName.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new TextField(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(DOCUMENT_NAME_CLOUD_TAG_PROPERTY, documentName.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } // if (documentSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(DOCUMENT_SUMMARY_CLOUD_TAG_PROPERTY, documentSummary, Field.Store.YES)); @@ -276,7 +276,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (keyword.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new TextField(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), Field.Store.YES)); + doc.add(new Field(DOCUMENT_KEYWORD_CLOUD_TAG_PROPERTY, keyword.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } } @@ -288,7 +288,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { File documentFile = new File(filepath); String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); if (StringUtils.isNotBlank(parsedDocumentFile)) { - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, TextField.TYPE_STORED)); + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, LuceneUtils.TYPE_STORED)); } } catch (TikaException te) { if (log.isErrorEnabled()) { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java index 43a3c43..26224cf 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java +++ 
b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java @@ -33,6 +33,8 @@ import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; @@ -51,6 +53,19 @@ public class LuceneUtils { public IndexWriter indexWriter; protected Tika tika; + public static final FieldType TYPE_STORED = new FieldType(); + static { + TYPE_STORED.setOmitNorms(true); + TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + TYPE_STORED.setStored(true); + TYPE_STORED.setStoreTermVectors(true); + TYPE_STORED.setStoreTermVectorPositions(true); + TYPE_STORED.setStoreTermVectorOffsets(true); + TYPE_STORED.setStoreTermVectorPayloads(true); + TYPE_STORED.setTokenized(true); + TYPE_STORED.freeze(); + } + protected CoselmarServicesConfig servicesConfig; public LuceneUtils(CoselmarServicesConfig servicesConfig) { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index b5e92d3..f60ce70 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -86,19 +86,6 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { protected static final String QUESTION_THEME_CLOUD_TAG_PROPERTY = "questionCloudTagTheme"; protected static final String DOCUMENT_TYPE = "questionindextype"; - public static final FieldType TYPE_STORED = new FieldType(); - 
static { - TYPE_STORED.setOmitNorms(true); - TYPE_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - TYPE_STORED.setStored(true); - TYPE_STORED.setStoreTermVectors(true); - TYPE_STORED.setStoreTermVectorPositions(true); - TYPE_STORED.setStoreTermVectorOffsets(true); - TYPE_STORED.setStoreTermVectorPayloads(true); - TYPE_STORED.setTokenized(true); - TYPE_STORED.freeze(); - } - public void indexQuestion(QuestionBean question) throws IOException { // First : try to find if already exist to update it @@ -120,11 +107,11 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new StringField(QUESTION_ID_INDEX_PROPERTY, question.getId(), Field.Store.YES)); doc.add(new TextField(QUESTION_TITLE_INDEX_PROPERTY, questionTitle, Field.Store.YES)); - doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, TYPE_STORED)); + doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, LuceneUtils.TYPE_STORED)); // Cloud Tag management if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), TYPE_STORED)); + doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } // if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); @@ -137,7 +124,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TYPE_STORED)); + doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } } @@ -157,14 +144,14 @@ public class QuestionsIndexationService extends 
CoselmarSimpleServiceSupport { doc.add(new StringField(QUESTION_ID_INDEX_PROPERTY, question.getId(), Field.Store.YES)); doc.add(new TextField(QUESTION_TITLE_INDEX_PROPERTY, questionTitle, Field.Store.YES)); - doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, TYPE_STORED)); + doc.add(new Field(QUESTION_SUMMARY_INDEX_PROPERTY, questionSummary, LuceneUtils.TYPE_STORED)); doc.add(new TextField(QUESTION_STATUS_INDEX_PROPERTY, question.getStatus(), Field.Store.YES)); doc.add(new TextField(QUESTION_PRIVACY_INDEX_PROPERTY, question.getPrivacy(), Field.Store.YES)); // Cloud Tag management if (questionTitle.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), TYPE_STORED)); + doc.add(new Field(QUESTION_TITLE_CLOUD_TAG_PROPERTY, questionTitle.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } // if (questionSummary.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { // doc.add(new TextField(QUESTION_SUMMARY_CLOUD_TAG_PROPERTY, questionSummary, Field.Store.YES)); @@ -177,7 +164,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { // Cloud Tag management if (theme.length() >= TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { - doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), TYPE_STORED)); + doc.add(new Field(QUESTION_THEME_CLOUD_TAG_PROPERTY, theme.replaceAll("'", " "), LuceneUtils.TYPE_STORED)); } } } -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit 1e43d560da4e23e5d45163f59ae9f9573e936ab5 Author: Yannick Martel <martel@codelutin.com> Date: Mon May 22 18:05:10 2017 +0200 refs #9197 use lucene to make cloud tag on question page : data from question and its documents including file content --- .../indexation/DocumentsIndexationService.java | 56 ++++++++++++++ .../indexation/QuestionsIndexationService.java | 15 ++-- .../coselmar/services/v1/QuestionsWebService.java | 85 ++++++++++++++++------ 3 files changed, 127 insertions(+), 29 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index e218577..224311d 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -34,20 +34,27 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Fields; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.BytesRef; import org.apache.tika.exception.TikaException; import java.io.File; import 
java.io.IOException; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.Set; /** @@ -329,4 +336,53 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } + public Map<String, Long> getTopDocumentsTerms(List<String> questionIds) throws IOException { + + DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); + IndexSearcher isearcher = new IndexSearcher(ireader); + + // Combine that with the type + BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); + queryBuilder.add(new TermQuery(new Term("type", DOCUMENT_TYPE)), BooleanClause.Occur.MUST); + + BooleanQuery.Builder questionIdBuilder = new BooleanQuery.Builder(); + for (String questionId : questionIds) { + if(StringUtils.isNotBlank(questionId)) { + questionIdBuilder.add(new TermQuery(new Term(DOCUMENT_ID_INDEX_PROPERTY, questionId.toLowerCase())), BooleanClause.Occur.SHOULD); + } + } + queryBuilder.add(questionIdBuilder.build(), BooleanClause.Occur.MUST); + + TopDocs hits = isearcher.search(queryBuilder.build(), 100); + ScoreDoc[] scoreDocs = hits.scoreDocs; + + Map<String, Long> result = new LinkedHashMap<>(); + + for (ScoreDoc scoreDoc : scoreDocs) { + Fields termVectors = ireader.getTermVectors(scoreDoc.doc); + for (String termVector : termVectors) { + Terms vector = ireader.getTermVector(scoreDoc.doc, termVector); + TermsEnum termsEnum = vector.iterator(); + BytesRef bytesRef = termsEnum.next(); + while(bytesRef != null){ + String term = bytesRef.utf8ToString().toLowerCase(); + long totalTermFreq = termsEnum.totalTermFreq(); + + if (term.length() > TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (result.containsKey(term)) { + result.put(term, result.get(term) + totalTermFreq); + } else { + result.put(term, totalTermFreq); + } + } + bytesRef = termsEnum.next(); + } + } + } + + ireader.close(); + return result; + } + + } diff --git 
a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java index f60ce70..32bb35b 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/QuestionsIndexationService.java @@ -302,7 +302,7 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { return result; } - public Map<String, Long> getTopQuestionsTerms(List<String> questionIds) throws IOException, ParseException { + public Map<String, Long> getTopQuestionsTerms(List<String> questionIds) throws IOException { DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); @@ -327,18 +327,19 @@ public class QuestionsIndexationService extends CoselmarSimpleServiceSupport { for (ScoreDoc scoreDoc : scoreDocs) { Fields termVectors = ireader.getTermVectors(scoreDoc.doc); for (String termVector : termVectors) { - System.out.println("Vector: " + termVector); Terms vector = ireader.getTermVector(scoreDoc.doc, termVector); TermsEnum termsEnum = vector.iterator(); BytesRef bytesRef = termsEnum.next(); while(bytesRef != null){ - String term = bytesRef.utf8ToString(); + String term = bytesRef.utf8ToString().toLowerCase(); long totalTermFreq = termsEnum.totalTermFreq(); - if (result.containsKey(term)) { - result.put(term, result.get(term) + totalTermFreq); - } else { - result.put(term, totalTermFreq); + if (term.length() > TransverseIndexationService.CLOUD_TAG_WORD_MIN_SIZE) { + if (result.containsKey(term)) { + result.put(term, result.get(term) + totalTermFreq); + } else { + result.put(term, totalTermFreq); + } } bytesRef = termsEnum.next(); } diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/QuestionsWebService.java 
b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/QuestionsWebService.java index 384aa7b..e7a9d48 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/QuestionsWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/QuestionsWebService.java @@ -24,15 +24,6 @@ package fr.ifremer.coselmar.services.v1; * #L% */ -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Date; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - import com.google.common.base.Function; import com.google.common.base.Preconditions; import com.google.common.collect.Collections2; @@ -65,6 +56,7 @@ import fr.ifremer.coselmar.services.CoselmarWebServiceSupport; import fr.ifremer.coselmar.services.errors.InvalidCredentialException; import fr.ifremer.coselmar.services.errors.NoResultException; import fr.ifremer.coselmar.services.errors.UnauthorizedException; +import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; import fr.ifremer.coselmar.services.indexation.QuestionsIndexationService; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -79,6 +71,16 @@ import org.nuiton.util.DateUtil; import org.nuiton.util.pagination.PaginationParameter; import org.nuiton.util.pagination.PaginationResult; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + /** * @author ymartel <martel@codelutin.com> */ @@ -1069,25 +1071,52 @@ public class QuestionsWebService extends CoselmarWebServiceSupport { // Check authentication String authorization = getContext().getHeader("Authorization"); - UserWebToken userWebToken = checkAuthentication(authorization); + CoselmarUser user = checkUserAuthentication(authorization); - // 
Check current user - String fullCurrentUserId = getFullUserIdFromShort(userWebToken.getUserId()); - getCoselmarUserDao().forTopiaIdEquals(fullCurrentUserId).findAny(); + // Retrieve Question + String fullQuestionId = getFullIdFromShort(Question.class, questionId); + Question question = getQuestionDao().forTopiaIdEquals(fullQuestionId).findUnique(); - List<CloudWord> topWords; - if (getCoselmarServicesConfig().isPostgresqlDatabase()) { + List<CloudWord> topWords = new ArrayList<>(); +// if (getCoselmarServicesConfig().isPostgresqlDatabase()) { +// try { +// topWords = getQuestionDao().findTopWords(getFullIdFromShort(Question.class, questionId)); +// } catch (TopiaNoResultException e) { +// if (log.isErrorEnabled()) { +// log.error("Try to find top words for non existing questionId" + questionId, e); +// } +// throw new NoResultException("Question does not exist"); +// } +// } else { +// topWords = Collections.EMPTY_LIST; + + QuestionsIndexationService questionsIndexationService = getServicesContext().newService(QuestionsIndexationService.class); + DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); try { - topWords = getQuestionDao().findTopWords(getFullIdFromShort(Question.class, questionId)); - } catch (TopiaNoResultException e) { + Map<String, Long> topQuestionsTerms = questionsIndexationService.getTopQuestionsTerms(Lists.newArrayList(questionId)); + List<String> shortDocumentIds = getShortDocumentIds(question); + Map<String, Long> topDocumentsTerms = documentsIndexationService.getTopDocumentsTerms(shortDocumentIds); + for (Map.Entry<String, Long> documentTermFreq : topDocumentsTerms.entrySet()) { + String term = documentTermFreq.getKey(); + Long frequence = documentTermFreq.getValue(); + if (topQuestionsTerms.containsKey(term)) { + } else { + topQuestionsTerms.put(term, frequence); + } + } + + for (Map.Entry<String, Long> termFreq : topQuestionsTerms.entrySet()) { + String term = 
termFreq.getKey(); + CloudWord cloudWord = new CloudWord(term, termFreq.getValue()); + topWords.add(cloudWord); + } + + } catch (IOException e) { if (log.isErrorEnabled()) { - log.error("Try to find top words for non existing questionId" + questionId, e); + log.error("Unable to index new question", e); } - throw new NoResultException("Question does not exist"); } - } else { - topWords = Collections.EMPTY_LIST; - } +// } return topWords; } @@ -1542,4 +1571,16 @@ public class QuestionsWebService extends CoselmarWebServiceSupport { return result; } + protected List<String> getShortDocumentIds(Question question) { + List<String> shortDocumentIds = new ArrayList<>(); + for (String relatedDocumentId : question.getRelatedDocumentsTopiaIds()) { + String shortIdFromFull = getShortIdFromFull(relatedDocumentId); + shortDocumentIds.add(shortIdFromFull); + } + for (String closingDocumentId : question.getClosingDocumentsTopiaIds()) { + String shortIdFromFull = getShortIdFromFull(closingDocumentId); + shortDocumentIds.add(shortIdFromFull); + } + return shortDocumentIds; + } } -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit c4e9b3e7d51a79043dfb935091ba4d0f6550dbc1 Merge: b25dc5e 1e43d56 Author: Yannick Martel <martel@codelutin.com> Date: Wed May 24 11:04:54 2017 +0200 Merge branch 'feature/9197-Indexation_documents' into develop coselmar-rest/pom.xml | 8 +- .../indexation/DocumentsIndexationService.java | 215 +++++++++++++++------ .../coselmar/services/indexation/LuceneUtils.java | 29 ++- .../indexation/QuestionsIndexationService.java | 76 ++++---- .../indexation/TransverseIndexationService.java | 6 +- .../coselmar/services/v1/AdminWebService.java | 2 +- .../coselmar/services/v1/DocumentsWebService.java | 38 ++-- .../coselmar/services/v1/QuestionsWebService.java | 85 +++++--- .../indexation/DocumentsIndexationServiceTest.java | 16 +- .../indexation/QuestionsIndexationServiceTest.java | 22 +-- pom.xml | 15 +- 11 files changed, 354 insertions(+), 158 deletions(-) -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
participants (1)
-
codelutin.com scm