branch develop updated (f432be6 -> f37fe4f)
This is an automated email from the git hooks/post-receive script. New change to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git from f432be6 Exclusion de bcprov-jdk15on de la dépendance apache tika (conflit avec la version de webmotion?) adds a763b05 Ajout du fileContent dans la table Document et comparatif de recherche entre lucene et pg pour les topterms adds 56f5389 Rest-Expose lucene and pg versions of question topterms methods adds 0dd6ace Review cloudtag request from postgresql and fix indexation from lucene new ba59165 Remove ExperimentationService new f37fe4f refs #9197 review way to use Tika, filter file content indexation for pdf, opendocument text/presentation and ms word/powerpoint The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "adds" were already present in the repository and have only been added to this reference. Detailed log of new commits: commit f37fe4f513d6bd4c8f8e52ae2086873baba477c2 Author: Yannick Martel <martel@©odelutin.com> Date: Tue May 30 17:08:16 2017 +0200 refs #9197 review way to use Tika, filter file content indexation for pdf, opendocument text/presentation and ms word/powerpoint commit ba5916563162056a6ee21bf374daa732aec98947 Author: Yannick Martel <martel@©odelutin.com> Date: Tue May 30 15:58:28 2017 +0200 Remove ExperimentationService Summary of changes: .../java/fr/ifremer/coselmar/beans/CloudWord.java | 5 ++ .../persistence/entity/QuestionTopiaDao.java | 40 +++++++-------- ...2_1_0_1__9197_add_fileContent_in_documents.sql} | 2 +- .../src/main/xmi/coselmar-model.properties | 1 + .../src/main/xmi/coselmar-model.zargo | Bin 11007 -> 11150 bytes .../indexation/DocumentsIndexationService.java | 47 +++++------------- .../coselmar/services/indexation/LuceneUtils.java | 7 --- .../coselmar/services/indexation/TikaUtils.java | 54 +++++++++++++++++++++ .../coselmar/services/v1/AdminWebService.java | 12 ++++- .../coselmar/services/v1/DocumentsWebService.java | 15 ++++-- .../coselmar/services/v1/QuestionsWebService.java | 33 ++++++++----- 11 files changed, 135 insertions(+), 81 deletions(-) copy coselmar-persistence/src/main/resources/db/migration/{V1_4_0_1__7911_add_citation_in_documents.sql => V2_1_0_1__9197_add_fileContent_in_documents.sql} (94%) create mode 100644 coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TikaUtils.java -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit ba5916563162056a6ee21bf374daa732aec98947 Author: Yannick Martel <martel@©odelutin.com> Date: Tue May 30 15:58:28 2017 +0200 Remove ExperimentationService --- .../services/v1/ExperimentationService.java | 191 --------------------- coselmar-rest/src/main/resources/mapping | 4 - 2 files changed, 195 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/ExperimentationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/ExperimentationService.java deleted file mode 100644 index 3c83b41..0000000 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/ExperimentationService.java +++ /dev/null @@ -1,191 +0,0 @@ -package fr.ifremer.coselmar.services.v1; - -/* - * #%L - * Coselmar :: Rest Services - * $Id:$ - * $HeadURL:$ - * %% - * Copyright (C) 2014 Ifremer, Code Lutin - * %% - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program. If not, see - * <http://www.gnu.org/licenses/gpl-3.0.html>. - * #L% - */ - -import com.google.common.base.Function; -import com.google.common.base.Preconditions; -import com.google.common.collect.Collections2; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; -import com.google.common.collect.Ordering; -import com.google.common.collect.Sets; -import com.rometools.rome.feed.rss.Cloud; -import fr.ifremer.coselmar.beans.CloudWord; -import fr.ifremer.coselmar.beans.DocumentBean; -import fr.ifremer.coselmar.beans.LinkBean; -import fr.ifremer.coselmar.beans.QuestionBean; -import fr.ifremer.coselmar.beans.QuestionExportModel; -import fr.ifremer.coselmar.beans.QuestionSearchBean; -import fr.ifremer.coselmar.beans.QuestionSearchExample; -import fr.ifremer.coselmar.beans.QuestionTreeNode; -import fr.ifremer.coselmar.beans.QuestionUserRole; -import fr.ifremer.coselmar.beans.UserBean; -import fr.ifremer.coselmar.beans.UserWebToken; -import fr.ifremer.coselmar.converter.BeanEntityConverter; -import fr.ifremer.coselmar.exceptions.CoselmarTechnicalException; -import fr.ifremer.coselmar.persistence.entity.CoselmarUser; -import fr.ifremer.coselmar.persistence.entity.CoselmarUserGroup; -import fr.ifremer.coselmar.persistence.entity.CoselmarUserRole; -import fr.ifremer.coselmar.persistence.entity.Document; -import fr.ifremer.coselmar.persistence.entity.Link; -import fr.ifremer.coselmar.persistence.entity.LinkImpl; -import fr.ifremer.coselmar.persistence.entity.Privacy; -import fr.ifremer.coselmar.persistence.entity.Question; -import fr.ifremer.coselmar.persistence.entity.QuestionImpl; -import fr.ifremer.coselmar.persistence.entity.Status; -import fr.ifremer.coselmar.services.CoselmarWebServiceSupport; -import fr.ifremer.coselmar.services.errors.InvalidCredentialException; -import fr.ifremer.coselmar.services.errors.NoResultException; -import fr.ifremer.coselmar.services.errors.UnauthorizedException; -import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; -import fr.ifremer.coselmar.services.indexation.QuestionsIndexationService; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.lucene.queryparser.classic.ParseException; -import org.debux.webmotion.server.render.Render; -import org.nuiton.csv.Export; -import org.nuiton.topia.persistence.TopiaIdFactory; -import org.nuiton.topia.persistence.TopiaNoResultException; -import org.nuiton.util.DateUtil; -import org.nuiton.util.pagination.PaginationParameter; -import org.nuiton.util.pagination.PaginationResult; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Date; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -/** - * @author ymartel <martel@codelutin.com> - */ -public class ExperimentationService extends CoselmarWebServiceSupport { - - private static final Log log = LogFactory.getLog(ExperimentationService.class); - - public List<String> evaluateTopWordsGeneration() { - Question question = getQuestionDao().forStatusEquals(Status.IN_PROGRESS).findAny(); - long start = System.currentTimeMillis(); - String questionId = getShortIdFromFull(question.getTopiaId()); - System.out.println("Question : " + questionId); - List<CloudWord> luceneTopWords = getLuceneTopWords(questionId); - long stop = System.currentTimeMillis(); - String luceneTiming = String.format("Recherche par Lucene : %d termes en %d ms", luceneTopWords.size(), stop - start); - start = System.currentTimeMillis(); - List<CloudWord> postgresTopWords = getPostgresTopWords(questionId); - stop = System.currentTimeMillis(); - String pgTiming = String.format("Recherche par Postgresql : %d termes en %d ms", postgresTopWords.size(), stop - start); - - return Lists.newArrayList(luceneTiming, pgTiming); - } - - public List<CloudWord> getLuceneTopWords(String questionId) { - - // Retrieve Question - String fullQuestionId = getFullIdFromShort(Question.class, questionId); - Question question = getQuestionDao().forTopiaIdEquals(fullQuestionId).findUnique(); - - List<CloudWord> topWords = new ArrayList<>(); - - QuestionsIndexationService questionsIndexationService = getServicesContext().newService(QuestionsIndexationService.class); - DocumentsIndexationService documentsIndexationService = getServicesContext().newService(DocumentsIndexationService.class); - try { - Map<String, Long> topQuestionsTerms = questionsIndexationService.getTopQuestionsTerms(Lists.newArrayList(questionId)); - List<String> shortDocumentIds = getShortDocumentIds(question); - Map<String, Long> topDocumentsTerms = documentsIndexationService.getTopDocumentsTerms(shortDocumentIds); - for (Map.Entry<String, Long> documentTermFreq : topDocumentsTerms.entrySet()) { - String term = documentTermFreq.getKey(); - Long frequence = documentTermFreq.getValue(); - if (topQuestionsTerms.containsKey(term)) { - topQuestionsTerms.put(term, topQuestionsTerms.get(term) + frequence); - } else { - topQuestionsTerms.put(term, frequence); - } - } - - for (Map.Entry<String, Long> termFreq : topQuestionsTerms.entrySet()) { - String term = termFreq.getKey(); - CloudWord cloudWord = new CloudWord(term, termFreq.getValue()); - topWords.add(cloudWord); - } - - } catch (IOException e) { - if (log.isErrorEnabled()) { - log.error("Unable to index new question", e); - } - } - - ImmutableList<CloudWord> cloudWords = ImmutableList.copyOf(Ordering.natural().onResultOf(new Function<CloudWord, Long>() { - public Long apply(CloudWord input) { - return input.getWeight(); - } - }).reverse().sortedCopy(topWords)); - - return cloudWords; - } - - public List<CloudWord> getPostgresTopWords(String questionId) { - - List<CloudWord> topWords; - if (getCoselmarServicesConfig().isPostgresqlDatabase()) { - try { - topWords = getQuestionDao().findTopWords(getFullIdFromShort(Question.class, questionId)); - } catch (TopiaNoResultException e) { - if (log.isErrorEnabled()) { - log.error("Try to find top words for non existing questionId" + questionId, e); - } - topWords = Collections.EMPTY_LIST; - } - } else { - topWords = Collections.EMPTY_LIST; - } - - return topWords; - } - - //////////////////////////////////////////////////////////////////////////// - /////////////////////// Internal Parts ///////////////////////////// - //////////////////////////////////////////////////////////////////////////// - - protected List<String> getShortDocumentIds(Question question) { - List<String> shortDocumentIds = new ArrayList<>(); - for (String relatedDocumentId : question.getRelatedDocumentsTopiaIds()) { - String shortIdFromFull = getShortIdFromFull(relatedDocumentId); - shortDocumentIds.add(shortIdFromFull); - } - for (String closingDocumentId : question.getClosingDocumentsTopiaIds()) { - String shortIdFromFull = getShortIdFromFull(closingDocumentId); - shortDocumentIds.add(shortIdFromFull); - } - return shortDocumentIds; - } -} diff --git a/coselmar-rest/src/main/resources/mapping b/coselmar-rest/src/main/resources/mapping index 27e8e5b..b498a97 100644 --- a/coselmar-rest/src/main/resources/mapping +++ b/coselmar-rest/src/main/resources/mapping @@ -76,10 +76,6 @@ GET /v1/questions/{questionId}/topwords QuestionsWebService.getTopWords GET /v1/general/topwords GeneralWebService.getTopWords -GET /v1/experimentation/topwords ExperimentationService.evaluateTopWordsGeneration -GET /v1/experimentation/lucenetopwords/{questionId} ExperimentationService.getLuceneTopWords -GET /v1/experimentation/pgtopwords/{questionId} ExperimentationService.getPostgresTopWords - # Admin API POST /v1/admin/lucene/index AdminWebService.refreshLuceneIndex -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
This is an automated email from the git hooks/post-receive script. New commit to branch develop in repository coselmar. See https://gitlab.nuiton.org/codelutin/coselmar.git commit f37fe4f513d6bd4c8f8e52ae2086873baba477c2 Author: Yannick Martel <martel@©odelutin.com> Date: Tue May 30 17:08:16 2017 +0200 refs #9197 review way to use Tika, filter file content indexation for pdf, opendocument text/presentation and ms word/powerpoint --- .../indexation/DocumentsIndexationService.java | 21 ++------- .../coselmar/services/indexation/LuceneUtils.java | 7 --- .../coselmar/services/indexation/TikaUtils.java | 54 ++++++++++++++++++++++ .../coselmar/services/v1/AdminWebService.java | 21 +++------ .../coselmar/services/v1/DocumentsWebService.java | 28 ++--------- 5 files changed, 69 insertions(+), 62 deletions(-) diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java index 158c10f..f820531 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/DocumentsIndexationService.java @@ -48,6 +48,7 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; import org.apache.tika.exception.TikaException; +import org.apache.tika.mime.MimeType; import java.io.File; import java.io.IOException; @@ -122,7 +123,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { } if (StringUtils.isNotBlank(fileContent)) { - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, fileContent, LuceneUtils.TYPE_STORED)); + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, fileContent, LuceneUtils.TYPE_STORED)); } getLuceneUtils().getIndexWriter().addDocument(doc); @@ -236,7 +237,7 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { return documentIds; } - public void updateDocument(DocumentBean document, String filepath) throws IOException { + public void updateDocument(DocumentBean document, String fileContent) throws IOException { DirectoryReader ireader = DirectoryReader.open(getLuceneUtils().getIndexWriter()); IndexSearcher isearcher = new IndexSearcher(ireader); @@ -280,20 +281,8 @@ public class DocumentsIndexationService extends CoselmarSimpleServiceSupport { doc.add(new Field("type", DOCUMENT_TYPE, TextField.TYPE_STORED)); - if (StringUtils.isNotBlank(filepath)) { - try { - File documentFile = new File(filepath); - String parsedDocumentFile = getLuceneUtils().getTika().parseToString(documentFile); - if (StringUtils.isNotBlank(parsedDocumentFile)) { - doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, parsedDocumentFile, LuceneUtils.TYPE_STORED)); - } - } catch (TikaException te) { - if (log.isErrorEnabled()) { - String message = String.format("Unable to index document '%s'", filepath); - log.error(message); - } - } - + if (StringUtils.isNotBlank(fileContent)) { + doc.add(new Field(DOCUMENT_FILE_CONTENT_INDEX_PROPERTY, fileContent, LuceneUtils.TYPE_STORED)); } getLuceneUtils().getIndexWriter().updateDocument(new Term(DOCUMENT_ID_INDEX_PROPERTY, document.getId()), doc); diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java index 26224cf..6e6b0ef 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/LuceneUtils.java @@ -93,13 +93,6 @@ public class LuceneUtils { return indexWriter; } - public Tika getTika() { - if (tika == null) { - this.tika = new Tika(); - } - return tika; - } - public void closeWriter() { if (indexWriter != null) { try { diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TikaUtils.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TikaUtils.java new file mode 100644 index 0000000..eb05bfe --- /dev/null +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/indexation/TikaUtils.java @@ -0,0 +1,54 @@ +package fr.ifremer.coselmar.services.indexation; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +/** + * @author ymartel (martel@codelutin.com) + */ +public class TikaUtils { + + private static final Log log = LogFactory.getLog(TikaUtils.class); + + public static final List<String> READABLE_TEXT_MIMETYPES = Arrays.asList("text/plain", + "application/pdt", + "application/vnd.oasis.opendocument.text", + "application/vnd.oasis.opendocument.presentation", + "application/msword", + "application/mspowerpoint", + "application/powerpoint", + "application/vnd.ms-powerpoint", + "text/html" + ); + + private static final Tika tika = new Tika(); + + public static String getFileContent(String filePath) { + String fileContent = ""; + File file = new File(filePath); + try { + String mimeType = tika.detect(file); + // Can we read it ? + if (StringUtils.isNotBlank(mimeType) && READABLE_TEXT_MIMETYPES.contains(mimeType.toLowerCase())) { + fileContent = tika.parseToString(file); + } + } catch (IOException e) { + if (log.isErrorEnabled()) { + log.error("Unable to read file " + filePath, e); + } + } catch (TikaException e) { + if (log.isErrorEnabled()) { + log.error("Unable to get file content from Tika : " + filePath, e); + } + } + return fileContent; + } +} diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java index 98ad677..f39ab29 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/AdminWebService.java @@ -41,6 +41,7 @@ import fr.ifremer.coselmar.services.errors.InvalidCredentialException; import fr.ifremer.coselmar.services.errors.UnauthorizedException; import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; import fr.ifremer.coselmar.services.indexation.QuestionsIndexationService; +import fr.ifremer.coselmar.services.indexation.TikaUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.tika.exception.TikaException; @@ -79,22 +80,12 @@ public class AdminWebService extends CoselmarWebServiceSupport { List<Document> documents = getDocumentDao().findAll(); for (Document document : documents) { DocumentBean documentBean = BeanEntityConverter.toBean(getPersistenceContext().getTopiaIdFactory(), document); - String filePath = document.getFilePath(); - String fileContent = null; - try { - fileContent = getServicesContext().getLuceneUtils().getTika().parseToString(new File(filePath)); - document.setFileContent(fileContent); - getDocumentDao().update(document); - } catch (IOException e) { - if (log.isErrorEnabled()) { - log.error("Unable to read uploaded file " + filePath, e); - } - } catch (TikaException e) { - if (log.isErrorEnabled()) { - log.error("Unable to get file content from Tika : " + filePath, e); - } - } + // Refresh file information + String fileContent = TikaUtils.getFileContent(document.getFilePath()); documentsIndexationService.indexDocument(documentBean, fileContent); + // Refresh database content + document.setFileContent(fileContent); + getDocumentDao().update(document); } commit(); diff --git a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java index d8bda0c..63e133d 100644 --- a/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java +++ b/coselmar-rest/src/main/java/fr/ifremer/coselmar/services/v1/DocumentsWebService.java @@ -49,6 +49,7 @@ import fr.ifremer.coselmar.services.errors.InvalidCredentialException; import fr.ifremer.coselmar.services.errors.NoResultException; import fr.ifremer.coselmar.services.errors.UnauthorizedException; import fr.ifremer.coselmar.services.indexation.DocumentsIndexationService; +import fr.ifremer.coselmar.services.indexation.TikaUtils; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; @@ -380,22 +381,12 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { String filePath = null; String fileContent = null; - // If document has a file, manager it ! + // If document has a file, manage it ! if (uploadFile != null) { Pair<String, String> pathAndContentType = managerDocumentFile(uploadFile, owner); filePath = pathAndContentType.getLeft(); contentType = pathAndContentType.getRight(); - try { - fileContent = getServicesContext().getLuceneUtils().getTika().parseToString(new File(filePath)); - } catch (IOException e) { - if (log.isErrorEnabled()) { - log.error("Unable to read uploaded file " + filePath, e); - } - } catch (TikaException e) { - if (log.isErrorEnabled()) { - log.error("Unable to get file content from Tika : " + filePath, e); - } - } + fileContent = TikaUtils.getFileContent(filePath); } // Document Metadata @@ -504,19 +495,8 @@ public class DocumentsWebService extends CoselmarWebServiceSupport { Pair<String, String> pathAndContentType = managerDocumentFile(uploadFile, owner); String filePath = pathAndContentType.getLeft(); String contentType = pathAndContentType.getRight(); - String fileContent = null; // Read file content - try { - fileContent = getServicesContext().getLuceneUtils().getTika().parseToString(new File(filePath)); - } catch (IOException e) { - if (log.isErrorEnabled()) { - log.error("Unable to read uploaded file " + filePath, e); - } - } catch (TikaException e) { - if (log.isErrorEnabled()) { - log.error("Unable to get file content from Tika : " + filePath, e); - } - } + String fileContent = TikaUtils.getFileContent(filePath); // If document has already a file, remove it if (StringUtils.isNotBlank(documentEntity.getFilePath())) { -- To stop receiving notification emails like this one, please contact codelutin.com SCM administrator <admin+scm@codelutin.com>.
participants (1)
-
codelutin.com scm