
Commit

Merge remote-tracking branch 'origin/develop'
Mark MacGillivray committed Nov 25, 2022
2 parents a408547 + 6ebc3e2 commit 98a73d5
Showing 5 changed files with 83 additions and 37 deletions.
31 changes: 17 additions & 14 deletions server/src/utilities/convert.coffee
@@ -3,21 +3,10 @@ P.convert ?= {}

# apt-get install pdftk poppler-utils antiword unoconv #ghostscript tesseract-ocr

P.convert.docxtest = () -> # this works but needs some cleaning up
cn = await @_child 'unzip', ['-p', '/home/cloo/static/ExtendedInterval.docx', 'word/document.xml']
content = ''
first = true
for s in cn.split '<w:t'
if not first
content += ' ' if not content.endsWith ' '
content += s.split('>')[1].split('</w:t')[0].replace(/\<.*?\>/g, '')
first = false
return content

P.convert._content2file = (content) ->
if not content? and (@params.url or @params.content)
pc = @params.url ? @params.content
if pc.startsWith('http://') or (not pc.startsWith('/') and not pc.includes '../')
if pc.startsWith('http://') or pc.startsWith('https://') or (not pc.startsWith('/') and not pc.includes '../')
content = pc
if typeof content is 'string' and content.startsWith 'http'
content = await @fetch content, buffer: true
@@ -36,14 +25,28 @@ P.convert.doc2txt = (content) ->
await fs.unlink(cn) if typeof cn is 'string' and cn.startsWith '/tmp/'
return txt

P.convert.docx2txt = (content) ->
'''P.convert.docx2txt = (content) ->
cn = await @convert._content2file content
await @_child 'unoconv', ['-f', 'doc', cn] # unoconv also has a convenience command set up called doc2pdf
await fs.unlink(cn) if cn.startsWith '/tmp/'
cn = cn.replace('.docx', '') + '.doc'
txt = await @convert.doc2txt cn
try await fs.unlink(cn) if typeof cn is 'string' and cn.startsWith '/tmp/'
return txt
return txt'''

P.convert.docx2txt = (content) ->
cn = await @convert._content2file content
#cn = '/home/cloo/static/ExtendedInterval.docx'
content = await @_child 'unzip', ['-p', cn, 'word/document.xml']
await fs.unlink(cn) if typeof cn is 'string' and cn.startsWith '/tmp/'
txt = ''
first = true
for s in content.split '<w:t'
if not first
txt += ' ' if not txt.endsWith ' '
txt += s.split('>')[1].split('</w:t')[0].replace(/\<.*?\>/g, '')
first = false
return txt.replace /\<w\:.*?\/ /g, '' # remove things like table formatting from main body text, but leave table content

P.convert.pdf2txt = (content) ->
cn = await @convert._content2file content
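For reference, the new docx2txt above drops the unoconv route and reads the .docx container directly: unzip -p streams word/document.xml, and the plain text is recovered from the <w:t> runs. A minimal standalone sketch of the same idea, outside the P.convert wrapper and using Node's child_process directly; the helper name and file path are illustrative, not part of the commit.

{ execFile } = require 'child_process'
{ promisify } = require 'util'
run = promisify execFile

docx2txt = (path) ->
  # a .docx is a zip archive; word/document.xml holds the body text
  { stdout } = await run 'unzip', ['-p', path, 'word/document.xml'], maxBuffer: 50 * 1024 * 1024
  txt = ''
  first = true
  for s in stdout.split '<w:t'
    if not first
      txt += ' ' if not txt.endsWith ' '
      # keep only the run content, stripping any remaining tags
      txt += s.split('>')[1].split('</w:t')[0].replace(/\<.*?\>/g, '')
    first = false
  # as in the committed version, strip leftover w: formatting fragments such as table markup
  return txt.replace(/\<w\:.*?\/ /g, '')

# usage (assumes unzip is installed and example.docx exists):
# docx2txt('./example.docx').then console.log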
56 changes: 41 additions & 15 deletions worker/dist/worker.js
@@ -951,7 +951,7 @@ P._wrapper = function(f, n) { // the function to wrap and the string name of the
});
}
_makecsv = async(rt, qry, out, keys, notify, eurl, pfs) => {
var awards, blfl, blp, blr, first, funder, key, len3, len4, len5, len6, names, nar, o, q, ref13, ref14, ref15, s, st, u, val;
var awards, blfl, bljnd, blp, blr, bn, first, funder, key, len3, len4, len5, len6, len7, names, nar, o, q, ref13, ref14, ref15, ref16, s, st, u, v, val;
first = true;
if (pfs) {
keys = ['DOI', 'funder.name', 'funder.award'];
@@ -1027,7 +1027,18 @@ P._wrapper = function(f, n) { // the function to wrap and the string name of the
val = '';
} else if (typeof blfl[k] === 'object') {
if (Array.isArray(blfl[k])) {
blfl[k] = blfl[k].join(';');
bljnd = '';
ref16 = blfl[k];
for (v = 0, len7 = ref16.length; v < len7; v++) {
bn = ref16[v];
if (typeof bn === 'object') {
bn = JSON.stringify(bn);
}
if (!bljnd.includes(bn)) { // Joe doesn't want duplicates kept
bljnd += (bljnd ? ';' : '') + bn;
}
}
blfl[k] = bljnd;
}
val = JSON.stringify(blfl[k]);
} else {
@@ -4880,19 +4891,36 @@ P.report = function() {
};

P.report.fixtypes = async function() {
var checked, cr, fixed, ref, rr;
var checked, cr, fixed, fr, ol, ref, ref1, ref2, rr, titled, tr;
checked = 0;
fixed = 0;
ref = this.index._for((this.S.dev ? 'paradigm_b_' : 'paradigm_') + 'report_works', 'NOT type.keyword:"journal-article" AND NOT type.keyword:"posted-content"', {
scroll: '30m',
include: ['DOI', 'type']
titled = 0;
ref = this.index._for((this.S.dev ? 'paradigm_b_' : 'paradigm_') + 'report_works', 'NOT title:*', {
scroll: '30m'
});
for await (rr of ref) {
for await (tr of ref) {
if (tr.DOI.startsWith('10.')) {
titled += 1;
cr = (await this.src.crossref.works(tr.DOI));
ol = (await this.src.openalex.works('ids.doi.keyword:"https://doi.org/' + tr.DOI + '"', 1));
fr = (await this.report.works._process(cr, ol));
await this.report.works(fr);
}
console.log('fixing report works types titles', titled);
}
ref1 = this.index._for((this.S.dev ? 'paradigm_b_' : 'paradigm_') + 'report_works', '(NOT type.keyword:"journal-article" AND NOT type.keyword:"posted-content")', {
scroll: '30m'
});
for await (rr of ref1) {
checked += 1;
if (cr = (await this.src.crossref.works(cr.DOI))) {
if (cr.type !== rr.type) {
if (rr.DOI.startsWith('10.')) {
cr = (await this.src.crossref.works(rr.DOI));
if (cr == null) {
ol = (await this.src.openalex.works('ids.doi.keyword:"https://doi.org/' + rr.DOI + '"', 1));
}
if (((cr != null) || (ol != null)) && ((cr != null ? cr.type : void 0) !== rr.type || (ol != null ? ol.type : void 0) !== rr.type)) {
fixed += 1;
rr.type = cr.type;
rr.type = (ref2 = cr != null ? cr.type : void 0) != null ? ref2 : ol.type;
await this.report.works(rr);
}
}
@@ -4901,7 +4929,7 @@ P.report.fixtypes = async function() {
this.mail({
to: ['mark@oa.works'],
subject: 'OA report works types fixed ' + fixed,
text: checked + ' checked and fixed ' + fixed
text: checked + ' checked and fixed ' + fixed + ' and reprocessed ' + titled
});
return fixed;
};
@@ -5561,7 +5589,7 @@ P.report.works.load = async function(timestamp, crossref, openalex, supplement,
batch = [];
}
if (qry == null) {
qry = '(type.keyword:"journal-article" OR type.keyword:"posted-content") AND (funder.name:* OR author.affiliation.name:*) AND year.keyword:' + year;
qry = '(funder.name:* OR author.affiliation.name:*) AND year.keyword:' + year;
}
if (year && !qry.includes(':' + year)) {
qry = '(' + qry + ') AND year.keyword:' + year;
@@ -14356,9 +14384,7 @@ P.decode = async function(content) {
};


S.built = "Fri Nov 18 2022 07:39:53 GMT+0000";
P.convert.docxtest = {_bg: true}// added by constructor

S.built = "Thu Nov 24 2022 06:34:05 GMT+0000";
P.convert.doc2txt = {_bg: true}// added by constructor

P.convert.docx2txt = {_bg: true}// added by constructor
2 changes: 1 addition & 1 deletion worker/dist/worker.min.js

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion worker/src/api.coffee
@@ -611,7 +611,12 @@ P._wrapper = (f, n) -> # the function to wrap and the string name of the functio
if not blfl? or not blfl[k]?
val = ''
else if typeof blfl[k] is 'object'
blfl[k] = blfl[k].join(';') if Array.isArray blfl[k]
if Array.isArray blfl[k]
bljnd = ''
for bn in blfl[k]
bn = JSON.stringify(bn) if typeof bn is 'object'
bljnd += (if bljnd then ';' else '') + bn if not bljnd.includes bn # Joe doesn't want duplicates kept
blfl[k] = bljnd
val = JSON.stringify blfl[k]
else
val = blfl[k]
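The api.coffee change above swaps a plain blfl[k].join(';') for a loop that stringifies object entries and only appends values not already present in the joined string, so duplicate array entries no longer reach the CSV cell. A standalone sketch of that de-duplicating join; the helper name is made up for illustration.

dedupJoin = (arr, sep = ';') ->
  out = ''
  for item in arr
    # object entries become JSON strings, as in the committed loop
    item = JSON.stringify(item) if typeof item is 'object'
    # only append values not already present in the joined string
    out += (if out then sep else '') + item if not out.includes item
  return out

# dedupJoin ['a', 'b', 'a', {x: 1}, {x: 1}]   # -> 'a;b;{"x":1}'

Note that containment is checked against the whole joined string, so a value that happens to be a substring of an earlier entry is also dropped; that mirrors the behaviour of the committed loop.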
24 changes: 18 additions & 6 deletions worker/src/report.coffee
@@ -4,18 +4,30 @@ P.report = () -> return 'OA.Works report'
P.report.fixtypes = () ->
checked = 0
fixed = 0
for await rr from @index._for (if @S.dev then 'paradigm_b_' else 'paradigm_') + 'report_works', 'NOT type.keyword:"journal-article" AND NOT type.keyword:"posted-content"', scroll: '30m', include: ['DOI', 'type']
titled = 0
for await tr from @index._for (if @S.dev then 'paradigm_b_' else 'paradigm_') + 'report_works', 'NOT title:*', scroll: '30m'
if tr.DOI.startsWith '10.'
titled += 1
cr = await @src.crossref.works tr.DOI
ol = await @src.openalex.works 'ids.doi.keyword:"https://doi.org/' + tr.DOI + '"', 1
fr = await @report.works._process cr, ol
await @report.works fr
console.log 'fixing report works types titles', titled
for await rr from @index._for (if @S.dev then 'paradigm_b_' else 'paradigm_') + 'report_works', '(NOT type.keyword:"journal-article" AND NOT type.keyword:"posted-content")', scroll: '30m'
checked += 1
if cr = await @src.crossref.works cr.DOI
if cr.type isnt rr.type
if rr.DOI.startsWith '10.'
cr = await @src.crossref.works rr.DOI
if not cr?
ol = await @src.openalex.works 'ids.doi.keyword:"https://doi.org/' + rr.DOI + '"', 1
if (cr? or ol?) and (cr?.type isnt rr.type or ol?.type isnt rr.type)
fixed += 1
rr.type = cr.type
rr.type = cr?.type ? ol.type
await @report.works rr
console.log 'fixing report works types', checked, fixed
@mail
to: ['mark@oa.works']
subject: 'OA report works types fixed ' + fixed
text: checked + ' checked and fixed ' + fixed
text: checked + ' checked and fixed ' + fixed + ' and reprocessed ' + titled
return fixed
P.report.fixtypes._async = true
P.report.fixtypes._bg = true
@@ -381,7 +393,7 @@ P.report.works.load = (timestamp, crossref, openalex, supplement, qry, oaqry, no
await @report.works batch
batch = []

qry ?= '(type.keyword:"journal-article" OR type.keyword:"posted-content") AND (funder.name:* OR author.affiliation.name:*) AND year.keyword:' + year
qry ?= '(funder.name:* OR author.affiliation.name:*) AND year.keyword:' + year
qry = '(' + qry + ') AND year.keyword:' + year if year and not qry.includes ':' + year
qry = '(' + qry + ') AND srcday:>' + timestamp if timestamp and not qry.includes ':>' + timestamp
console.log qry
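The fixtypes change above now falls back to an OpenAlex lookup when Crossref has no record for a DOI, and takes whichever type is available via CoffeeScript's existential operators. A tiny illustrative sketch of that fallback with made-up record objects; the committed line is rr.type = cr?.type ? ol.type, which does not null-check ol, so the sketch adds that guard for safety.

pickType = (crossrefRecord, openalexRecord) ->
  # prefer the Crossref type when the record exists, otherwise fall back to OpenAlex
  crossrefRecord?.type ? openalexRecord?.type

# pickType {type: 'journal-article'}, {type: 'article'}   # -> 'journal-article'
# pickType undefined, {type: 'article'}                   # -> 'article'
# pickType undefined, undefined                           # -> undefined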
