From 730202a36cdcd00b8283b15e94ebac45b0a61096 Mon Sep 17 00:00:00 2001 From: A Happy Cat Date: Wed, 7 Jun 2017 12:04:12 +0200 Subject: [PATCH] force disabled ocr in document conversion and fixed issue with sockets --- app/api/documents/documentQueryBuilder.js | 102 -------- .../specs/documentsQueryBuilder.spec.js | 117 --------- .../specs/documentsQueryBuilder.spec.js | 247 ------------------ app/api/upload/PDF.js | 2 +- app/api/upload/routes.js | 1 - app/react/Metadata/components/UploadButton.js | 52 ++-- app/react/Uploads/components/UploadBox.js | 37 +-- app/react/socket.js | 8 + app/react/sockets.js | 3 +- 9 files changed, 41 insertions(+), 528 deletions(-) delete mode 100644 app/api/documents/documentQueryBuilder.js delete mode 100644 app/api/documents/specs/documentsQueryBuilder.spec.js delete mode 100644 app/api/search/specs/documentsQueryBuilder.spec.js create mode 100644 app/react/socket.js diff --git a/app/api/documents/documentQueryBuilder.js b/app/api/documents/documentQueryBuilder.js deleted file mode 100644 index 1aa117b88e..0000000000 --- a/app/api/documents/documentQueryBuilder.js +++ /dev/null @@ -1,102 +0,0 @@ - -export default function () { - let baseQuery = { - _source: { - include: [ 'doc.title', 'doc.processed', 'doc.creationDate', 'doc.template', 'doc.metadata'] - }, - from: 0, - size: 12, - query: { - match_all: {} - }, - sort: [], - filter: { - bool: { - must: [ - {match: {'doc.published': true}} - ] - } - } - }; - - return { - query() { - return baseQuery; - }, - - fullTextSearch(term, fieldsToSearch = ['doc.fullText', 'doc.metadata.*', 'doc.title']) { - if (term) { - baseQuery.query = { - multi_match: { - query: term, - type: 'phrase_prefix', - fields: fieldsToSearch - } - }; - } - return this; - }, - - sort(property, order = 'desc') { - let sort = {}; - sort[`doc.${property}`] = {order, ignore_unmapped: true}; - baseQuery.sort.push(sort); - return this; - }, - - filterMetadata(filters = {}) { - Object.keys(filters).forEach((property) => { - if (filters[property].type === 'text') { - let match = {}; - match[`doc.metadata.${property}`] = filters[property].value; - baseQuery.filter.bool.must.push({match}); - } - - if (filters[property].type === 'range') { - let range = {}; - range[`doc.metadata.${property}`] = {gte: filters[property].value.from, lte: filters[property].value.to}; - baseQuery.filter.bool.must.push({range}); - } - }); - return this; - }, - - filterByTemplate(templates = []) { - if (templates.length) { - let match = {bool: { - should: [], - minimum_should_match: 1 - }}; - - templates.forEach((templateId) => { - match.bool.should.push({match: {'doc.template': templateId}}); - }); - - baseQuery.filter.bool.must.push(match); - } - return this; - }, - - highlight(fields) { - baseQuery.highlight = { - pre_tags : [''], - post_tags : [''] - }; - baseQuery.highlight.fields = {}; - fields.forEach((field) => { - baseQuery.highlight.fields[field] = {}; - }); - return this; - }, - - from(from) { - baseQuery.from = from; - return this; - }, - - limit(size) { - baseQuery.size = size; - return this; - } - }; -} diff --git a/app/api/documents/specs/documentsQueryBuilder.spec.js b/app/api/documents/specs/documentsQueryBuilder.spec.js deleted file mode 100644 index ca2059112f..0000000000 --- a/app/api/documents/specs/documentsQueryBuilder.spec.js +++ /dev/null @@ -1,117 +0,0 @@ -import queryBuilder from 'api/documents/documentQueryBuilder'; - -describe('documentQueryBuilder', () => { - beforeEach(() => {}); - - describe('default query', () => { - it('should do a match all on published documents', () => { - expect(queryBuilder().query().query).toEqual({match_all: {}}); - expect(queryBuilder().query().filter.bool.must[0]).toEqual({match: {'doc.published': true}}); - }); - }); - - describe('from', () => { - it('should set from', () => { - expect(queryBuilder().from(5).query().from).toEqual(5); - }); - }); - - describe('limit', () => { - it('should set size', () => { - expect(queryBuilder().limit(55).query().size).toEqual(55); - }); - }); - - describe('filterMetadata', () => { - it('should add filter conditions', () => { - let query = queryBuilder().filterMetadata({property1: {value: 'value1', type: 'text'}, property2: {value: 'value2', type: 'text'}}).query(); - expect(query.filter.bool.must[0]).toEqual({match: {'doc.published': true}}); - expect(query.filter.bool.must[1]).toEqual({match: {'doc.metadata.property1': 'value1'}}); - expect(query.filter.bool.must[2]).toEqual({match: {'doc.metadata.property2': 'value2'}}); - }); - - it('should filter range filters', () => { - let query = queryBuilder().filterMetadata({property1: {value: {from: 10, to: 20}, type: 'range'}}).query(); - expect(query.filter.bool.must[0]).toEqual({match: {'doc.published': true}}); - expect(query.filter.bool.must[1]).toEqual({range: {'doc.metadata.property1': {gte: 10, lte: 20}}}); - }); - - describe('when there is no filters', () => { - it('should add filter conditions', () => { - let query = queryBuilder().filterMetadata().query(); - expect(query.filter.bool.must[0]).toEqual({match: {'doc.published': true}}); - expect(query.filter.bool.must.length).toBe(1); - }); - }); - }); - - describe('filterByTemplate', () => { - it('should add a match to get only documents that match with the templates', () => { - let query = queryBuilder().filterByTemplate(['template1', 'template2']).query(); - let expectedMatcher = { - bool: { - should: [ - {match: {'doc.template': 'template1'}}, - {match: {'doc.template': 'template2'}} - ], - minimum_should_match: 1 - } - }; - expect(query.filter.bool.must[1]).toEqual(expectedMatcher); - }); - }); - - describe('fullTextSearch', () => { - it('should do a multi_match on default fields', () => { - let query = queryBuilder().fullTextSearch('term').query(); - expect(query.query).toEqual({ - multi_match: { - query: 'term', - type: 'phrase_prefix', - fields: ['doc.fullText', 'doc.metadata.*', 'doc.title'] - } - }); - }); - - describe('when term is blank', () => { - it('should return the default match_all', () => { - let query = queryBuilder().fullTextSearch('').query(); - expect(query.query).toEqual({match_all: {}}); - }); - }); - - describe('sort', () => { - it('should add a sort property desc by default', () => { - let query = queryBuilder().sort('title').query(); - expect(query.sort[0]).toEqual({'doc.title': {order: 'desc', ignore_unmapped: true}}); - }); - it('should sort by order passed', () => { - let query = queryBuilder().sort('title', 'asc').query(); - expect(query.sort[0]).toEqual({'doc.title': {order: 'asc', ignore_unmapped: true}}); - }); - }); - - describe('when passing fields', () => { - it('should use them instead of the default ones', () => { - let query = queryBuilder().fullTextSearch('term', ['another.field']).query(); - expect(query.query).toEqual({ - multi_match: { - query: 'term', - type: 'phrase_prefix', - fields: ['another.field'] - } - }); - }); - }); - }); - - describe('highlights', () => { - it('should return a query with hilight configuration for the fields passed', () => { - let query = queryBuilder().highlight(['field1', 'field2']).query(); - expect(query.highlight.fields).toEqual({ - field1: {}, - field2: {} - }); - }); - }); -}); diff --git a/app/api/search/specs/documentsQueryBuilder.spec.js b/app/api/search/specs/documentsQueryBuilder.spec.js deleted file mode 100644 index 35d138b02e..0000000000 --- a/app/api/search/specs/documentsQueryBuilder.spec.js +++ /dev/null @@ -1,247 +0,0 @@ -/* eslint-disable camelcase */ -import queryBuilder from 'api/search/documentQueryBuilder'; - -xdescribe('documentQueryBuilder', () => { - beforeEach(() => {}); - - describe('default query', () => { - it('should do a match all on published documents', () => { - expect(queryBuilder().query().query.bool.must[0]).toEqual({match: {published: true}}); - }); - }); - - describe('unpublished', () => { - it('should do a match all on published documents', () => { - expect(queryBuilder().unpublished().query().query.bool.must[0]).toEqual({match: {published: false}}); - }); - }); - - describe('owner', () => { - it('should do a match all documents uploaded by a specific user', () => { - const user = {_id: '123'}; - expect(queryBuilder().owner(user).query().query.bool.must[1]).toEqual({match: {user: '123'}}); - }); - }); - - describe('from', () => { - it('should set from', () => { - expect(queryBuilder().from(5).query().from).toEqual(5); - }); - }); - - describe('limit', () => { - it('should set size', () => { - expect(queryBuilder().limit(55).query().size).toEqual(55); - }); - }); - - describe('language', () => { - it('should set language', () => { - let baseQuery = queryBuilder().language('es').query(); - expect(baseQuery.query.bool.must[1]).toEqual({match: {language: 'es'}}); - - baseQuery = queryBuilder().language('en').query(); - expect(baseQuery.query.bool.must[1]).toEqual({match: {language: 'en'}}); - }); - }); - - describe('includeUnpublished', () => { - it('should allow including unpulbished documents', () => { - let baseQuery = queryBuilder().includeUnpublished().query(); - expect(baseQuery.query.bool.must.length).toBe(0); - - baseQuery = queryBuilder().language('es').includeUnpublished().query(); - expect(baseQuery.query.bool.must[0]).toEqual({match: {language: 'es'}}); - }); - }); - - describe('filterMetadata', () => { - it('should add filter conditions', () => { - let baseQuery = queryBuilder().filterMetadata({ - property1: {value: 'value1', type: 'text'}, - property2: {value: 'value2', type: 'text'} - }).query(); - expect(baseQuery.query.bool.must[1]).toEqual({match: {'metadata.property1': 'value1'}}); - expect(baseQuery.query.bool.must[2]).toEqual({match: {'metadata.property2': 'value2'}}); - }); - - it('should filter range filters', () => { - let baseQuery = queryBuilder().filterMetadata({property1: {value: {from: 10, to: 20}, type: 'range'}}).query(); - expect(baseQuery.query.bool.must[1]).toEqual({range: {'metadata.property1': {gte: 10, lte: 20}}}); - }); - - it('should filter multiselect filters', () => { - let baseQuery = queryBuilder().filterMetadata({property1: {value: [23, 4, 16], type: 'multiselect'}}).query(); - expect(baseQuery.query.bool.must[1]).toEqual({terms: {'metadata.property1.raw': [23, 4, 16]}}); - }); - }); - - describe('filterByTemplate', () => { - it('should add a match to get only documents that match with the templates', () => { - let baseQuery = queryBuilder().filterByTemplate(['template1', 'template2']).query(); - let expectedMatcher = {terms: {template: ['template1', 'template2']}}; - expect(baseQuery.query.bool.must[1]).toEqual(expectedMatcher); - }); - }); - - describe('filterById', () => { - it('should add a match to get only documents that match with the passed ids', () => { - let baseQuery = queryBuilder().filterById(['id1', 'id2']).query(); - let expectedMatcher = {terms: {'sharedId.raw': ['id1', 'id2']}}; - //expect(baseQuery.filter.bool.must[0]).toEqual(expectedMatcher); - expect(baseQuery.query.bool.must[1]).toEqual(expectedMatcher); - }); - - describe('when id is a single value', () => { - it('should add it to an array', () => { - let baseQuery = queryBuilder().filterById('id').query(); - let expectedMatcher = {terms: {'sharedId.raw': ['id']}}; - expect(baseQuery.query.bool.must[1]).toEqual(expectedMatcher); - }); - }); - }); - - describe('aggregations', () => { - it('default aggregations should contain types', () => { - let baseQuery = queryBuilder().query(); - let typesAggregation = { - terms: { - field: 'template.raw', - missing: 'missing', - size: 9999 - }, - aggregations: { - filtered: { - filter: { - bool: { - must: [] - } - } - } - } - }; - expect(baseQuery.aggregations.types).toEqual(typesAggregation); - }); - - it('should add aggregations to the query with the current filters', () => { - let baseQuery = queryBuilder().aggregations([{name: 'property1'}, {name: 'property2'}]).query(); - let property1Aggregation = { - terms: { - field: 'metadata.property1.raw', - size: 9999 - }, - aggregations: { - filtered: { - filter: { - bool: { - must: [{match: {published: true}}] - } - } - } - } - }; - - expect(baseQuery.aggregations.property1).toEqual(property1Aggregation); - }); - }); - - describe('fullTextSearch', () => { - it('should do a multi_match on default fields', () => { - let baseQuery = queryBuilder().fullTextSearch('term').query(); - expect(baseQuery.query.bool.must[1]).toEqual( - { - bool: { - should: [ - { - has_child: { - type: 'fullText', - score_mode: 'max', - inner_hits: { - _source: false, - highlight: { - pre_tags: [''], - post_tags: [''], - fields: { - fullText: {number_of_fragments: 10} - } - } - }, - query: { - multi_match: { - query: 'term', - type: 'phrase_prefix', - fields: 'fullText' - } - } - } - }, - { - multi_match: { - query: 'term', - type: 'phrase_prefix', - fields: ['title'] - } - } - ] - } - } - ); - }); - - describe('when fieldsToSearch is empty', () => { - it('shoud not include the multi_match', () => { - let baseQuery = queryBuilder().fullTextSearch('term', [], false).query(); - expect(baseQuery.query.bool.must[1]).toEqual( - { - bool: { - should: [ - ] - } - } - ); - }); - }); - - describe('when includeFullText = false', () => { - it('should only search on the document by fieldsToSearch', () => { - let baseQuery = queryBuilder().fullTextSearch('term', ['field1', 'field2'], false).query(); - expect(baseQuery.query.bool.must[1]).toEqual( - { - bool: { - should: [ - { - multi_match: { - query: 'term', - type: 'phrase_prefix', - fields: ['field1', 'field2'] - } - } - ] - } - } - ); - }); - }); - - describe('sort', () => { - it('should add a sort property desc by default', () => { - let baseQuery = queryBuilder().sort('title').query(); - expect(baseQuery.sort[0]).toEqual({'title.raw': {order: 'desc', unmapped_type: 'boolean'}}); - }); - it('should sort by order passed', () => { - let baseQuery = queryBuilder().sort('title', 'asc').query(); - expect(baseQuery.sort[0]).toEqual({'title.raw': {order: 'asc', unmapped_type: 'boolean'}}); - }); - }); - }); - - describe('highlights', () => { - it('should return a query with hilight configuration for the fields passed', () => { - let baseQuery = queryBuilder().highlight(['field1', 'field2']).query(); - expect(baseQuery.highlight.fields).toEqual({ - field1: {}, - field2: {} - }); - }); - }); -}); diff --git a/app/api/upload/PDF.js b/app/api/upload/PDF.js index e1f842090c..0750bfece2 100644 --- a/app/api/upload/PDF.js +++ b/app/api/upload/PDF.js @@ -22,7 +22,7 @@ export default class PDF extends EventEmitter { extractText() { let logFile = fs.createWriteStream(this.logFile, {flags: 'a'}); let tmpPath = '/tmp/' + Date.now() + 'docsplit/'; - let options = ['text', '-o', tmpPath, this.filepath]; + let options = ['text', '--no-ocr', '-o', tmpPath, this.filepath]; let extraction = spawn('docsplit', options); extraction.stderr.pipe(logFile); extraction.stdout.pipe(logFile); diff --git a/app/api/upload/routes.js b/app/api/upload/routes.js index 78face3141..6023dcbdfe 100644 --- a/app/api/upload/routes.js +++ b/app/api/upload/routes.js @@ -34,7 +34,6 @@ export default (app) => { const docs = _docs.map((doc) => { doc.file = req.files[0]; doc.uploaded = true; - doc.processed = false; return doc; }); return entities.saveMultiple(docs); diff --git a/app/react/Metadata/components/UploadButton.js b/app/react/Metadata/components/UploadButton.js index 7c4bde1b3a..487806ffd6 100644 --- a/app/react/Metadata/components/UploadButton.js +++ b/app/react/Metadata/components/UploadButton.js @@ -3,7 +3,7 @@ import React, {Component} from 'react'; import {connect} from 'react-redux'; import {bindActionCreators} from 'redux'; import {reuploadDocument} from 'app/Metadata/actions/actions'; -import io from 'socket.io-client'; +import socket from 'app/socket'; export class UploadButton extends Component { @@ -11,34 +11,20 @@ export class UploadButton extends Component { super(props, context); this.state = {processing: false, failed: false, completed: false}; - } - onChange(e) { - let file = e.target.files[0]; - this.context.confirm({ - accept: () => { - this.props.reuploadDocument(this.props.documentId, file, this.props.documentSharedId); - }, - title: 'Confirm upload', - message: 'Are you sure you want to upload a new document?\n\n' + - 'All Table of Contents (TOC) and all text-based references linked to the previous document will be lost.' + socket.on('conversionStart', (docId) => { + if (docId === this.props.documentId) { + this.setState({processing: true, failed: false, completed: false}); + } }); - } - - componentWillMount() { - //only on client - if (!window.document) { - return; - } - this.socket = io(); - this.socket.on('conversionStart', (docId) => { + socket.on('conversionFailed', (docId) => { if (docId === this.props.documentId) { - this.setState({processing: true, failed: false, completed: false}); + this.setState({processing: false, failed: true, completed: false}); } }); - this.socket.on('documentProcessed', (docId) => { + socket.on('documentProcessed', (docId) => { if (docId === this.props.documentId) { this.setState({processing: false, failed: false, completed: true}, () => { setTimeout(() => { @@ -47,20 +33,18 @@ export class UploadButton extends Component { }); } }); - - this.socket.on('conversionFailed', (docId) => { - if (docId === this.props.documentId) { - this.setState({processing: false, failed: true, completed: false}); - } - }); } - componentWillUnmount() { - //only on client - if (!window.document) { - return; - } - this.socket.disconnect(); + onChange(e) { + let file = e.target.files[0]; + this.context.confirm({ + accept: () => { + this.props.reuploadDocument(this.props.documentId, file, this.props.documentSharedId); + }, + title: 'Confirm upload', + message: 'Are you sure you want to upload a new document?\n\n' + + 'All Table of Contents (TOC) and all text-based references linked to the previous document will be lost.' + }); } renderUploadButton() { diff --git a/app/react/Uploads/components/UploadBox.js b/app/react/Uploads/components/UploadBox.js index c4060e555f..cadd8dd8fa 100644 --- a/app/react/Uploads/components/UploadBox.js +++ b/app/react/Uploads/components/UploadBox.js @@ -7,9 +7,21 @@ import {wrapDispatch} from 'app/Multireducer'; import {uploadDocument, createDocument, documentProcessed, documentProcessError} from 'app/Uploads/actions/uploadsActions'; import {unselectAllDocuments} from 'app/Library/actions/libraryActions'; -import io from 'socket.io-client'; +import socket from 'app/socket'; export class UploadBox extends Component { + + constructor(props) { + super(props); + socket.on('documentProcessed', (sharedId) => { + this.props.documentProcessed(sharedId); + }); + + socket.on('conversionFailed', (sharedId) => { + this.props.documentProcessError(sharedId); + }); + } + onDrop(files) { files.forEach((file) => { let doc = {title: this.extractTitle(file)}; @@ -21,29 +33,6 @@ export class UploadBox extends Component { this.props.unselectAllDocuments(); } - componentWillMount() { - //only on client - if (!window.document) { - return; - } - this.socket = io(); - this.socket.on('documentProcessed', (sharedId) => { - this.props.documentProcessed(sharedId); - }); - - this.socket.on('conversionFailed', (sharedId) => { - this.props.documentProcessError(sharedId); - }); - } - - componentWillUnmount() { - //only on client - if (!window.document) { - return; - } - this.socket.disconnect(); - } - extractTitle(file) { let title = file.name .replace(/\.[^/.]+$/, '') diff --git a/app/react/socket.js b/app/react/socket.js new file mode 100644 index 0000000000..379a687e89 --- /dev/null +++ b/app/react/socket.js @@ -0,0 +1,8 @@ +import io from 'socket.io-client'; +import {isClient} from 'app/utils'; +let socket = {on: () =>{}}; +if (isClient) { + socket = io(); +} + +export default socket; diff --git a/app/react/sockets.js b/app/react/sockets.js index b2234270f6..f6eb37da95 100644 --- a/app/react/sockets.js +++ b/app/react/sockets.js @@ -1,7 +1,6 @@ import {store} from './store'; import {actions} from 'app/BasicReducer'; -import io from 'socket.io-client'; -const socket = io(); +import socket from './socket'; socket.on('templateChange', (template) => { store.dispatch(actions.update('templates', template));