
Commit

Merge remote-tracking branch 'origin/develop'
Mark MacGillivray committed Nov 25, 2022
2 parents a408547 + 6ebc3e2 commit 98a73d5
Showing 5 changed files with 83 additions and 37 deletions.
31 changes: 17 additions & 14 deletions server/src/utilities/convert.coffee
@@ -3,21 +3,10 @@ P.convert ?= {}

# apt-get install pdftk poppler-utils antiword unoconv #ghostscript tesseract-ocr

P.convert.docxtest = () -> # this works but needs some cleaning up
cn = await @_child 'unzip', ['-p', '/home/cloo/static/ExtendedInterval.docx', 'word/document.xml']
content = ''
first = true
for s in cn.split '<w:t'
if not first
content += ' ' if not content.endsWith ' '
content += s.split('>')[1].split('</w:t')[0].replace(/\<.*?\>/g, '')
first = false
return content

P.convert._content2file = (content) ->
if not content? and (@params.url or @params.content)
pc = @params.url ? @params.content
if pc.startsWith('http://') or (not pc.startsWith('/') and not pc.includes '../')
if pc.startsWith('http://') or pc.startsWith('https://') or (not pc.startsWith('/') and not pc.includes '../')
content = pc
if typeof content is 'string' and content.startsWith 'http'
content = await @fetch content, buffer: true
@@ -36,14 +25,28 @@ P.convert.doc2txt = (content) ->
await fs.unlink(cn) if typeof cn is 'string' and cn.startsWith '/tmp/'
return txt

P.convert.docx2txt = (content) ->
'''P.convert.docx2txt = (content) ->
cn = await @convert._content2file content
await @_child 'unoconv', ['-f', 'doc', cn] # unoconv also has a convenience command set up called doc2pdf
await fs.unlink(cn) if cn.startsWith '/tmp/'
cn = cn.replace('.docx', '') + '.doc'
txt = await @convert.doc2txt cn
try await fs.unlink(cn) if typeof cn is 'string' and cn.startsWith '/tmp/'
return txt
return txt'''

P.convert.docx2txt = (content) ->
cn = await @convert._content2file content
#cn = '/home/cloo/static/ExtendedInterval.docx'
content = await @_child 'unzip', ['-p', cn, 'word/document.xml']
await fs.unlink(cn) if typeof cn is 'string' and cn.startsWith '/tmp/'
txt = ''
first = true
for s in content.split '<w:t'
if not first
txt += ' ' if not txt.endsWith ' '
txt += s.split('>')[1].split('</w:t')[0].replace(/\<.*?\>/g, '')
first = false
return txt.replace /\<w\:.*?\/ /g, '' # remove things like table formatting from main body text, but leave table content

P.convert.pdf2txt = (content) ->
cn = await @convert._content2file content
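For reference, the new docx2txt above drops the unoconv route and reads the .docx container directly: unzip -p streams word/document.xml, and the plain text is recovered from the <w:t> runs. A minimal standalone sketch of the same idea, outside the P.convert wrapper and using Node's child_process directly; the helper name and file path are illustrative, not part of the commit.

{ execFile } = require 'child_process'
{ promisify } = require 'util'
run = promisify execFile

docx2txt = (path) ->
  # a .docx is a zip archive; word/document.xml holds the body text
  { stdout } = await run 'unzip', ['-p', path, 'word/document.xml'], maxBuffer: 50 * 1024 * 1024
  txt = ''
  first = true
  for s in stdout.split '<w:t'
    if not first
      txt += ' ' if not txt.endsWith ' '
      # keep only the run content, stripping any remaining tags
      txt += s.split('>')[1].split('</w:t')[0].replace(/\<.*?\>/g, '')
    first = false
  # as in the committed version, strip leftover w: formatting fragments such as table markup
  return txt.replace(/\<w\:.*?\/ /g, '')

# usage (assumes unzip is installed and example.docx exists):
# docx2txt('./example.docx').then console.log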
56 changes: 41 additions & 15 deletions worker/dist/worker.js
@@ -951,7 +951,7 @@ P._wrapper = function(f, n) { // the function to wrap and the string name of the
});
}
_makecsv = async(rt, qry, out, keys, notify, eurl, pfs) => {
var awards, blfl, blp, blr, first, funder, key, len3, len4, len5, len6, names, nar, o, q, ref13, ref14, ref15, s, st, u, val;
var awards, blfl, bljnd, blp, blr, bn, first, funder, key, len3, len4, len5, len6, len7, names, nar, o, q, ref13, ref14, ref15, ref16, s, st, u, v, val;
first = true;
if (pfs) {
keys = ['DOI', 'funder.name', 'funder.award'];
@@ -1027,7 +1027,18 @@ P._wrapper = function(f, n) { // the function to wrap and the string name of the
val = '';
} else if (typeof blfl[k] === 'object') {
if (Array.isArray(blfl[k])) {
blfl[k] = blfl[k].join(';');
bljnd = '';
ref16 = blfl[k];
for (v = 0, len7 = ref16.length; v < len7; v++) {
bn = ref16[v];
if (typeof bn === 'object') {
bn = JSON.stringify(bn);
}
if (!bljnd.includes(bn)) { // Joe doesn't want duplicates kept
bljnd += (bljnd ? ';' : '') + bn;
}
}
blfl[k] = bljnd;
}
val = JSON.stringify(blfl[k]);
} else {
@@ -4880,19 +4891,36 @@ P.report = function() {
};

P.report.fixtypes = async function() {
var checked, cr, fixed, ref, rr;
var checked, cr, fixed, fr, ol, ref, ref1, ref2, rr, titled, tr;
checked = 0;
fixed = 0;
ref = this.index._for((this.S.dev ? 'paradigm_b_' : 'paradigm_') + 'report_works', 'NOT type.keyword:"journal-article" AND NOT type.keyword:"posted-content"', {
scroll: '30m',
include: ['DOI', 'type']
titled = 0;
ref = this.index._for((this.S.dev ? 'paradigm_b_' : 'paradigm_') + 'report_works', 'NOT title:*', {
scroll: '30m'
});
for await (rr of ref) {
for await (tr of ref) {
if (tr.DOI.startsWith('10.')) {
titled += 1;
cr = (await this.src.crossref.works(tr.DOI));
ol = (await this.src.openalex.works('ids.doi.keyword:"https://doi.org/' + tr.DOI + '"', 1));
fr = (await this.report.works._process(cr, ol));
await this.report.works(fr);
}
console.log('fixing report works types titles', titled);
}
ref1 = this.index._for((this.S.dev ? 'paradigm_b_' : 'paradigm_') + 'report_works', '(NOT type.keyword:"journal-article" AND NOT type.keyword:"posted-content")', {
scroll: '30m'
});
for await (rr of ref1) {
checked += 1;
if (cr = (await this.src.crossref.works(cr.DOI))) {
if (cr.type !== rr.type) {
if (rr.DOI.startsWith('10.')) {
cr = (await this.src.crossref.works(rr.DOI));
if (cr == null) {
ol = (await this.src.openalex.works('ids.doi.keyword:"https://doi.org/' + rr.DOI + '"', 1));
}
if (((cr != null) || (ol != null)) && ((cr != null ? cr.type : void 0) !== rr.type || (ol != null ? ol.type : void 0) !== rr.type)) {
fixed += 1;
rr.type = cr.type;
rr.type = (ref2 = cr != null ? cr.type : void 0) != null ? ref2 : ol.type;
await this.report.works(rr);
}
}
@@ -4901,7 +4929,7 @@ P.report.fixtypes = async function() {
this.mail({
to: ['mark@oa.works'],
subject: 'OA report works types fixed ' + fixed,
text: checked + ' checked and fixed ' + fixed
text: checked + ' checked and fixed ' + fixed + ' and reprocessed ' + titled
});
return fixed;
};
@@ -5561,7 +5589,7 @@ P.report.works.load = async function(timestamp, crossref, openalex, supplement,
batch = [];
}
if (qry == null) {
qry = '(type.keyword:"journal-article" OR type.keyword:"posted-content") AND (funder.name:* OR author.affiliation.name:*) AND year.keyword:' + year;
qry = '(funder.name:* OR author.affiliation.name:*) AND year.keyword:' + year;
}
if (year && !qry.includes(':' + year)) {
qry = '(' + qry + ') AND year.keyword:' + year;
@@ -14356,9 +14384,7 @@ P.decode = async function(content) {
};


S.built = "Fri Nov 18 2022 07:39:53 GMT+0000";
P.convert.docxtest = {_bg: true}// added by constructor

S.built = "Thu Nov 24 2022 06:34:05 GMT+0000";
P.convert.doc2txt = {_bg: true}// added by constructor

P.convert.docx2txt = {_bg: true}// added by constructor
2 changes: 1 addition & 1 deletion worker/dist/worker.min.js

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion worker/src/api.coffee
@@ -611,7 +611,12 @@ P._wrapper = (f, n) -> # the function to wrap and the string name of the functio
if not blfl? or not blfl[k]?
val = ''
else if typeof blfl[k] is 'object'
blfl[k] = blfl[k].join(';') if Array.isArray blfl[k]
if Array.isArray blfl[k]
bljnd = ''
for bn in blfl[k]
bn = JSON.stringify(bn) if typeof bn is 'object'
bljnd += (if bljnd then ';' else '') + bn if not bljnd.includes bn # Joe doesn't want duplicates kept
blfl[k] = bljnd
val = JSON.stringify blfl[k]
else
val = blfl[k]
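The api.coffee change above swaps a plain blfl[k].join(';') for a loop that stringifies object entries and only appends values not already present in the joined string, so duplicate array entries no longer reach the CSV cell. A standalone sketch of that de-duplicating join; the helper name is made up for illustration.

dedupJoin = (arr, sep = ';') ->
  out = ''
  for item in arr
    # object entries become JSON strings, as in the committed loop
    item = JSON.stringify(item) if typeof item is 'object'
    # only append values not already present in the joined string
    out += (if out then sep else '') + item if not out.includes item
  return out

# dedupJoin ['a', 'b', 'a', {x: 1}, {x: 1}]   # -> 'a;b;{"x":1}'

Note that containment is checked against the whole joined string, so a value that happens to be a substring of an earlier entry is also dropped; that mirrors the behaviour of the committed loop.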
24 changes: 18 additions & 6 deletions worker/src/report.coffee
@@ -4,18 +4,30 @@ P.report = () -> return 'OA.Works report'
P.report.fixtypes = () ->
checked = 0
fixed = 0
for await rr from @index._for (if @S.dev then 'paradigm_b_' else 'paradigm_') + 'report_works', 'NOT type.keyword:"journal-article" AND NOT type.keyword:"posted-content"', scroll: '30m', include: ['DOI', 'type']
titled = 0
for await tr from @index._for (if @S.dev then 'paradigm_b_' else 'paradigm_') + 'report_works', 'NOT title:*', scroll: '30m'
if tr.DOI.startsWith '10.'
titled += 1
cr = await @src.crossref.works tr.DOI
ol = await @src.openalex.works 'ids.doi.keyword:"https://doi.org/' + tr.DOI + '"', 1
fr = await @report.works._process cr, ol
await @report.works fr
console.log 'fixing report works types titles', titled
for await rr from @index._for (if @S.dev then 'paradigm_b_' else 'paradigm_') + 'report_works', '(NOT type.keyword:"journal-article" AND NOT type.keyword:"posted-content")', scroll: '30m'
checked += 1
if cr = await @src.crossref.works cr.DOI
if cr.type isnt rr.type
if rr.DOI.startsWith '10.'
cr = await @src.crossref.works rr.DOI
if not cr?
ol = await @src.openalex.works 'ids.doi.keyword:"https://doi.org/' + rr.DOI + '"', 1
if (cr? or ol?) and (cr?.type isnt rr.type or ol?.type isnt rr.type)
fixed += 1
rr.type = cr.type
rr.type = cr?.type ? ol.type
await @report.works rr
console.log 'fixing report works types', checked, fixed
@mail
to: ['mark@oa.works']
subject: 'OA report works types fixed ' + fixed
text: checked + ' checked and fixed ' + fixed
text: checked + ' checked and fixed ' + fixed + ' and reprocessed ' + titled
return fixed
P.report.fixtypes._async = true
P.report.fixtypes._bg = true
@@ -381,7 +393,7 @@ P.report.works.load = (timestamp, crossref, openalex, supplement, qry, oaqry, no
await @report.works batch
batch = []

qry ?= '(type.keyword:"journal-article" OR type.keyword:"posted-content") AND (funder.name:* OR author.affiliation.name:*) AND year.keyword:' + year
qry ?= '(funder.name:* OR author.affiliation.name:*) AND year.keyword:' + year
qry = '(' + qry + ') AND year.keyword:' + year if year and not qry.includes ':' + year
qry = '(' + qry + ') AND srcday:>' + timestamp if timestamp and not qry.includes ':>' + timestamp
console.log qry
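The fixtypes change above now falls back to an OpenAlex lookup when Crossref has no record for a DOI, and takes whichever type is available via CoffeeScript's existential operators. A tiny illustrative sketch of that fallback with made-up record objects; the committed line is rr.type = cr?.type ? ol.type, which does not null-check ol, so the sketch adds that guard for safety.

pickType = (crossrefRecord, openalexRecord) ->
  # prefer the Crossref type when the record exists, otherwise fall back to OpenAlex
  crossrefRecord?.type ? openalexRecord?.type

# pickType {type: 'journal-article'}, {type: 'article'}   # -> 'journal-article'
# pickType undefined, {type: 'article'}                   # -> 'article'
# pickType undefined, undefined                           # -> undefined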
