branch feature/comparatif-lunce-pg created (now d9939cf)
This is an automated email from the git hooks/post-receive script. New change to branch feature/comparatif-lunce-pg in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git at d9939cf Ajout du fileContent dans la table Document et comparatif de recherche entre lucene et pg pour les topterms This branch includes the following new commits: new d9939cf Ajout du fileContent dans la table Document et comparatif de recherche entre lucene et pg pour les topterms The 1 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "adds" were already present in the repository and have only been added to this reference. Detailed log of new commits: commit d9939cf96d1db441460b8e4809061cfdf2035761 Author: Yannick Martel <martel@codelutin.com> Date: Wed May 24 12:03:05 2017 +0200 Ajout du fileContent dans la table Document et comparatif de recherche entre lucene et pg pour les topterms -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch feature/comparatif-lunce-pg in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit d9939cf96d1db441460b8e4809061cfdf2035761 Author: Yannick Martel <martel@codelutin.com> Date: Wed May 24 12:03:05 2017 +0200 Ajout du fileContent dans la table Document et comparatif de recherche entre lucene et pg pour les topterms --- .../persistence/entity/QuestionTopiaDao.java | 15 +- ...V2_1_0_1__9197_add_fileContent_in_documents.sql | 23 +++ .../src/main/xmi/coselmar-model.properties | 1 + .../src/main/xmi/coselmar-model.zargo | Bin 11007 -> 11150 bytes .../indexation/DocumentsIndexationService.java | 16 +- .../coselmar/services/v1/AdminWebService.java | 21 ++- .../coselmar/services/v1/DocumentsWebService.java | 33 +++- .../services/v1/ExperimentationService.java | 180 +++++++++++++++++++++ coselmar-rest/src/main/resources/mapping | 2 + 9 files changed, 267 insertions(+), 24 deletions(-) diff --git a/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java b/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java index 8c66d18..8faba2f 100644 --- a/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java +++ b/coselmar-persistence/src/main/java/fr/ifremer/coselmar/persistence/entity/QuestionTopiaDao.java @@ -369,12 +369,13 @@ public class QuestionTopiaDao extends AbstractQuestionTopiaDao<Question> { private final String sql; private final String getSql(String questionId) { - return "SELECT word, nentry FROM ts_stat( ' select to_tsvector(''public.simple_english_conf'', q.title)" + - " || to_tsvector(''public.simple_english_conf'', q.summary)" + - " || to_tsvector(''public.simple_english_conf'', qt.theme)" + - " || COALESCE(to_tsvector(''public.simple_english_conf'', d.name),'''')" + - " || COALESCE(to_tsvector(''public.simple_english_conf'', 
dk.keywords),'''')" + - " || COALESCE(to_tsvector(''public.simple_english_conf'', d.summary),'''') FROM question q" + + return "SELECT word, nentry FROM ts_stat( ' select to_tsvector(''public.simple_english_conf'', q." + Question.PROPERTY_TITLE + ")" + + " || to_tsvector(''public.simple_english_conf'', q." + Question.PROPERTY_SUMMARY + ")" + + " || to_tsvector(''public.simple_english_conf'', qt." + Question.PROPERTY_THEME + ")" + + " || COALESCE(to_tsvector(''public.simple_english_conf'', d." + Document.PROPERTY_NAME + "),'''')" + + " || COALESCE(to_tsvector(''public.simple_english_conf'', dk." + Document.PROPERTY_KEYWORDS + "),'''')" + + " || COALESCE(to_tsvector(''public.simple_english_conf'', d." + Document.PROPERTY_SUMMARY + "),'''')" + + " || COALESCE(to_tsvector(''public.simple_english_conf'', d." + Document.PROPERTY_FILE_CONTENT + "),'''') FROM question q" + " LEFT JOIN relateddocuments_relatedquestion ON" + " relateddocuments_relatedquestion.relatedquestion = q.topiaid" + " LEFT JOIN closingdocuments_relatedquestion ON" + @@ -390,7 +391,7 @@ public class QuestionTopiaDao extends AbstractQuestionTopiaDao<Question> { " ')" + " WHERE char_length(word) > 3 " + " ORDER BY nentry DESC " + - " LIMIT 30"; + " "; } QuestionTermStatSqlQuery(String questionId) { diff --git a/coselmar-persistence/src/main/resources/db/migration/V2_1_0_1__9197_add_fileContent_in_documents.sql b/coselmar-persistence/src/main/resources/db/migration/V2_1_0_1__9197_add_fileContent_in_documents.sql new file mode 100644 index 0000000..f8a64c7 --- /dev/null +++ b/coselmar-persistence/src/main/resources/db/migration/V2_1_0_1__9197_add_fileContent_in_documents.sql @@ -0,0 +1,23 @@ +--- +-- #%L +-- Coselmar :: Persistence +-- %% +-- Copyright (C) 2014 - 2016 Ifremer, Code Lutin +-- %% +-- This program is free software: you can redistribute it and/or modify +-- it under the terms of the GNU General Public License as +-- published by the Free Software Foundation, either version 3 of the +-- 
License, or (at your option) any later version. +-- +-- This program is distributed in the hope that it will be useful, +-- but WITHOUT ANY WARRANTY; without even the implied warranty of +-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-- GNU General Public License for more details. +-- +-- You should have received a copy of the GNU General Public +-- License along with this program. If not, see +-- <http://www.gnu.org/licenses/gpl-3.0.html>. +-- #L% +--- + +ALTER TABLE document ADD fileContent TEXT; diff --git a/coselmar-persistence/src/main/xmi/coselmar-model.properties b/coselmar-persistence/src/main/xmi/coselmar-model.properties index 1ee6952..5419513 100644 --- a/coselmar-persistence/src/main/xmi/coselmar-model.properties +++ b/coselmar-persistence/src/main/xmi/coselmar-model.properties @@ -30,6 +30,7 @@ model.tagvalue.useEnumerationName=true fr.ifremer.coselmar.persistence.entity.Document.attribute.summary.tagValue.hibernateAttributeType=text fr.ifremer.coselmar.persistence.entity.Document.attribute.comment.tagValue.hibernateAttributeType=text fr.ifremer.coselmar.persistence.entity.Document.attribute.citation.tagValue.hibernateAttributeType=text +fr.ifremer.coselmar.persistence.entity.Document.attribute.fileContent.tagValue.hibernateAttributeType=text fr.ifremer.coselmar.persistence.entity.Question.attribute.summary.tagValue.hibernateAttributeType=text fr.ifremer.coselmar.persistence.entity.Question.attribute.conclusion.tagValue.hibernateAttributeType=text \ No newline at end of file diff --git a/coselmar-persistence/src/main/xmi/coselmar-model.zargo b/coselmar-persistence/src/main/xmi/coselmar-model.zargo index 85a95b3..c2eac82 100644 Binary files a/coselmar-persistence/src/main/xmi/coselmar-model.zargo and b/coselmar-persistence/src/main/xmi/coselmar-model.zargo differ diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java 
b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index 224311d..242af91 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -86,7 +86,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { protected static final String DOCUMENT_FILE_CONTENT_INDEX_PROPERTY = "documentFileContent"; protected static final String DOCUMENT_TYPE = "documentindextype"; - public void indexDocument(DocumentBean document, String filepath) throws IOException { + public void indexDocument(DocumentBean document, String fileContent) throws IOException { Document doc = new Document(); doc.add(new StringField(DOCUMENT_ID_INDEX_PROPERTY, document.getId(), Field.Store.YES)); @@ -121,18 +121,8 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } } - if (StringUtils.isNotBlank(filepath)) { - try { - File documentFile = new File(filepath); - String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, LuceneUtils.TYPE_STORED)); - } catch (TikaException te) { - if (log.isErrorEnabled()) { - String message = String.format("Unable to index document '%s'", filepath); - log.error(message); - } - } - + if (StringUtils.isNotBlank(fileContent)) { + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, fileContent, LuceneUtils.TYPE_STORED)); } getLuceneUtils().getIndexWriter().addDocument(doc); diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java index 33b3e57..98ad677 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java +++ 
b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java @@ -24,6 +24,7 @@ package fr.ifremer.coselmar.services.v1; * #L% */ +import java.io.File; import java.io.IOException; import java.util.List; @@ -42,6 +43,7 @@ import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; import fr.ifremer.coselmar.services.indexation.QuestionsIndexationService; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; +import org.apache.tika.exception.TikaException; import static org.apache.commons.logging.LogFactory.getLog; @@ -76,10 +78,25 @@ public class AdminWebService extends CoselmarWebServiceSupport { // get All documents List<Document> documents = getDocumentDao().findAll(); for (Document document : documents) { - String lightId = getPersistenceContext().getTopiaIdFactory().getRandomPart(document.getTopiaId()); DocumentBean documentBean = BeanEntityConverter.toBean(getPersistenceContext().getTopiaIdFactory(), document); - documentsIndexationService.indexDocument(documentBean, document.getFilePath()); + String filePath = document.getFilePath(); + String fileContent = null; + try { + fileContent = getServicesContext().getLuceneUtils().getTika().parseToString(new File(filePath)); + document.setFileContent(fileContent); + getDocumentDao().update(document); + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error("Unable to read uploaded file " + filePath, e); + } + } catch (TikaException e) { + if (log.isErrorEnabled()) { + log.error("Unable to get file content from Tika : " + filePath, e); + } + } + documentsIndexationService.indexDocument(documentBean, fileContent); } + commit(); // Get all questions List<Question> questions = getQuestionDao().findAll(); diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java index 9a1d010..d8bda0c 100644 --- 
a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java @@ -54,6 +54,7 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.logging.Log; import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.tika.exception.TikaException; import org.debux.webmotion.server.call.UploadFile; import org.debux.webmotion.server.render.Render; import org.nuiton.topia.persistence.TopiaNoResultException; @@ -65,6 +66,7 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; +import java.net.URL; import java.util.ArrayList; import java.util.Collection; import java.util.Date; @@ -376,12 +378,24 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { String documentName = document.getName(); String contentType = null; String filePath = null; + String fileContent = null; // If document has a file, manager it ! 
if (uploadFile != null) { Pair<String, String> pathAndContentType = managerDocumentFile(uploadFile, owner); filePath = pathAndContentType.getLeft(); contentType = pathAndContentType.getRight(); + try { + fileContent = getServicesContext().getLuceneUtils().getTika().parseToString(new File(filePath)); + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error("Unable to read uploaded file " + filePath, e); + } + } catch (TikaException e) { + if (log.isErrorEnabled()) { + log.error("Unable to get file content from Tika : " + filePath, e); + } + } } // Document Metadata @@ -431,6 +445,7 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { documentEntity.setWithFile(true); documentEntity.setMimeType(contentType); documentEntity.setFilePath(filePath); + documentEntity.setFileContent(fileContent); } else { documentEntity.setWithFile(false); } @@ -445,7 +460,7 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); try { - documentsIndexationService.indexDocument(result, filePath); + documentsIndexationService.indexDocument(result, fileContent); if (log.isDebugEnabled()) { String message = String.format("Document '%s' added to index", documentName); log.debug(message); @@ -489,6 +504,19 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { Pair<String, String> pathAndContentType = managerDocumentFile(uploadFile, owner); String filePath = pathAndContentType.getLeft(); String contentType = pathAndContentType.getRight(); + String fileContent = null; + // Read file content + try { + fileContent = getServicesContext().getLuceneUtils().getTika().parseToString(new File(filePath)); + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error("Unable to read uploaded file " + filePath, e); + } + } catch (TikaException e) { + if (log.isErrorEnabled()) { + log.error("Unable to get file 
content from Tika : " + filePath, e); + } + } // If document has already a file, remove it if (StringUtils.isNotBlank(documentEntity.getFilePath())) { @@ -500,13 +528,14 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { documentEntity.setMimeType(contentType); documentEntity.setFilePath(filePath); documentEntity.setFileName(uploadFile.getName()); + documentEntity.setFileContent(fileContent); // Should update document index information to put the file DocumentBean documentBean = BeanEntityConverter.toBean(getPersistenceContext().getTopiaIdFactory(), documentEntity); DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); try { - documentsIndexationService.updateDocument(documentBean, filePath); // no document file for the moment here + documentsIndexationService.updateDocument(documentBean, fileContent); if (log.isDebugEnabled()) { String message = String.format("Document '%s' was updated in index", documentEntity.getName()); log.debug(message); diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/ExperimentationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/ExperimentationService.java new file mode 100644 index 0000000..48cba7f --- /dev/null +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/ExperimentationService.java @@ -0,0 +1,180 @@ +package fr.ifremer.coselmar.services.v1; + +/* + * #%L + * Coselmar :: Rest Services + * $Id:$ + * $HeadURL:$ + * %% + * Copyright (C) 2014 Ifremer, Code Lutin + * %% + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. If not, see + * <http://www.gnu.org/licenses/gpl-3.0.html>. + * #L% + */ + +import com.google.common.base.Function; +import com.google.common.base.Preconditions; +import com.google.common.collect.Collections2; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import fr.ifremer.coselmar.beans.CloudWord; +import fr.ifremer.coselmar.beans.DocumentBean; +import fr.ifremer.coselmar.beans.LinkBean; +import fr.ifremer.coselmar.beans.QuestionBean; +import fr.ifremer.coselmar.beans.QuestionExportModel; +import fr.ifremer.coselmar.beans.QuestionSearchBean; +import fr.ifremer.coselmar.beans.QuestionSearchExample; +import fr.ifremer.coselmar.beans.QuestionTreeNode; +import fr.ifremer.coselmar.beans.QuestionUserRole; +import fr.ifremer.coselmar.beans.UserBean; +import fr.ifremer.coselmar.beans.UserWebToken; +import fr.ifremer.coselmar.converter.BeanEntityConverter; +import fr.ifremer.coselmar.exceptions.CoselmarTechnicalException; +import fr.ifremer.coselmar.persistence.entity.CoselmarUser; +import fr.ifremer.coselmar.persistence.entity.CoselmarUserGroup; +import fr.ifremer.coselmar.persistence.entity.CoselmarUserRole; +import fr.ifremer.coselmar.persistence.entity.Document; +import fr.ifremer.coselmar.persistence.entity.Link; +import fr.ifremer.coselmar.persistence.entity.LinkImpl; +import fr.ifremer.coselmar.persistence.entity.Privacy; +import fr.ifremer.coselmar.persistence.entity.Question; +import fr.ifremer.coselmar.persistence.entity.QuestionImpl; +import fr.ifremer.coselmar.persistence.entity.Status; +import fr.ifremer.coselmar.services.CoselmarWebServiceSupport; +import 
fr.ifremer.coselmar.services.errors.InvalidCredentialException; +import fr.ifremer.coselmar.services.errors.NoResultException; +import fr.ifremer.coselmar.services.errors.UnauthorizedException; +import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; +import fr.ifremer.coselmar.services.indexation.QuestionsIndexationService; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.lucene.queryparser.classic.ParseException; +import org.debux.webmotion.server.render.Render; +import org.nuiton.csv.Export; +import org.nuiton.topia.persistence.TopiaIdFactory; +import org.nuiton.topia.persistence.TopiaNoResultException; +import org.nuiton.util.DateUtil; +import org.nuiton.util.pagination.PaginationParameter; +import org.nuiton.util.pagination.PaginationResult; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * @author ymartel <martel@codelutin.com> + */ +public class ExperimentationService extends CoselmarWebServiceSupport { + + private static final Log log = LogFactory.getLog(ExperimentationService.class); + + public List<String> evaluateTopWordsGeneration() { + Question question = getQuestionDao().forStatusEquals(Status.IN_PROGRESS).findAny(); + long start = System.currentTimeMillis(); + String questionId = getShortIdFromFull(question.getTopiaId()); + System.out.println("Question : " + questionId); + List<CloudWord> luceneTopWords = getLuceneTopWords(questionId); + long stop = System.currentTimeMillis(); + String luceneTiming = String.format("Recherche par Lucene : %d termes en %d ms", luceneTopWords.size(), stop - start); + start = System.currentTimeMillis(); + List<CloudWord> postgresTopWords = 
getPostgresTopWords(questionId); + stop = System.currentTimeMillis(); + String pgTiming = String.format("Recherche par Postgresql : %d termes en %d ms", postgresTopWords.size(), stop - start); + + return Lists.newArrayList(luceneTiming, pgTiming); + } + + public List<CloudWord> getLuceneTopWords(String questionId) { + + // Retrieve Question + String fullQuestionId = getFullIdFromShort(Question.class, questionId); + Question question = getQuestionDao().forTopiaIdEquals(fullQuestionId).findUnique(); + + List<CloudWord> topWords = new ArrayList<>(); + + QuestionsIndexationService questionsIndexationService = getServicesContext().newService(QuestionsIndexationService.class); + DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); + try { + Map<String, Long> topQuestionsTerms = questionsIndexationService.getTopQuestionsTerms(Lists.newArrayList(questionId)); + List<String> shortDocumentIds = getShortDocumentIds(question); + Map<String, Long> topDocumentsTerms = documentsIndexationService.getTopDocumentsTerms(shortDocumentIds); + for (Map.Entry<String, Long> documentTermFreq : topDocumentsTerms.entrySet()) { + String term = documentTermFreq.getKey(); + Long frequence = documentTermFreq.getValue(); + if (topQuestionsTerms.containsKey(term)) { + } else { + topQuestionsTerms.put(term, frequence); + } + } + + for (Map.Entry<String, Long> termFreq : topQuestionsTerms.entrySet()) { + String term = termFreq.getKey(); + CloudWord cloudWord = new CloudWord(term, termFreq.getValue()); + topWords.add(cloudWord); + } + + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error("Unable to index new question", e); + } + } + + return topWords; + } + + public List<CloudWord> getPostgresTopWords(String questionId) { + + List<CloudWord> topWords; + if (getCoselmarServicesConfig().isPostgresqlDatabase()) { + try { + topWords = getQuestionDao().findTopWords(getFullIdFromShort(Question.class, questionId)); + } 
catch (TopiaNoResultException e) { + if (log.isErrorEnabled()) { + log.error("Try to find top words for non existing questionId" + questionId, e); + } + topWords = Collections.EMPTY_LIST; + } + } else { + topWords = Collections.EMPTY_LIST; + } + + return topWords; + } + + //////////////////////////////////////////////////////////////////////////// + /////////////////////// Internal Parts ///////////////////////////// + //////////////////////////////////////////////////////////////////////////// + + protected List<String> getShortDocumentIds(Question question) { + List<String> shortDocumentIds = new ArrayList<>(); + for (String relatedDocumentId : question.getRelatedDocumentsTopiaIds()) { + String shortIdFromFull = getShortIdFromFull(relatedDocumentId); + shortDocumentIds.add(shortIdFromFull); + } + for (String closingDocumentId : question.getClosingDocumentsTopiaIds()) { + String shortIdFromFull = getShortIdFromFull(closingDocumentId); + shortDocumentIds.add(shortIdFromFull); + } + return shortDocumentIds; + } +} diff --git a/coselmar-rest/src/main/resources/mapping b/coselmar-rest/src/main/resources/mapping index b498a97..147d53d 100644 --- a/coselmar-rest/src/main/resources/mapping +++ b/coselmar-rest/src/main/resources/mapping @@ -76,6 +76,8 @@ GET /v1/questions/{questionId}/topwords QuestionsWebService.getTopWords GET /v1/general/topwords GeneralWebService.getTopWords +GET /v1/experimentation/topwords ExperimentationService.evaluateTopWordsGeneration + # Admin API POST /v1/admin/lucene/index AdminWebService.refreshLuceneIndex -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
participants (1)
-
codelutin.com scm