Optimized text analysis

intersystems-ib · Aug 21, 2024 · 0d29c5f · 0d29c5f
1 parent 3e2f99a
commit 0d29c5f
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 47 deletions.
diff --git a/encoder-ui/src/app/analyzer/analyzer.component.ts b/encoder-ui/src/app/analyzer/analyzer.component.ts
@@ -48,8 +48,8 @@ export class AnalyzerComponent {
     this.diagnostics = [];
     var textHTML = this.textToAnalyze?.value;
     var textOriginal = textHTML;
-    var textToProcess = this.textToAnalyze?.value.split(".");
-    var forReading = 100/(textToProcess.length-1);
+    var textToProcess = this.textToAnalyze?.value.split(".").filter(Boolean);
+    var forReading = 100/(textToProcess.length);
     this.totalReceived = 0;
     this.error = false;
     this.loading = true;
@@ -58,7 +58,7 @@ export class AnalyzerComponent {
       "Text": textOriginal
     };
     this.irisService.saveRawText(rawText).subscribe({next: raw => {
-      this.totalReceived = 5
+      this.totalReceived = (100%(textToProcess.length)) + 1;
       for (var index in textToProcess){
         if (textToProcess[index] !== "")
         {

diff --git a/encoder-ui/src/app/codelist/codelist.component.ts b/encoder-ui/src/app/codelist/codelist.component.ts
@@ -6,7 +6,7 @@ import { IrisService } from '../services/iris.service';
   templateUrl: './codelist.component.html',
   styleUrl: './codelist.component.scss'
 })
-export class CodelistComponent implements OnInit, OnChanges {
+export class CodelistComponent implements OnChanges {
 
   @Input() codeRequestId: string = "";
   @Output() newCodeEvent = new EventEmitter<any>();
@@ -27,12 +27,6 @@ export class CodelistComponent implements OnInit, OnChanges {
       this.loadCodes();
     }
   }
-
-  ngOnInit(): void {
-      if (this.codeRequestId !== "") {
-        this.loadCodes();
-      }
-  }
 
   loadCodes(): void {
     this.irisService.getCodeOptions(this.codeRequestId).subscribe({

diff --git a/src/ENCODER/BP/AnalyzeTextProcess.cls b/src/ENCODER/BP/AnalyzeTextProcess.cls
@@ -31,45 +31,39 @@ Method AnalyzeText(text As %String, analysisId As %String, language As %String)
 
         return " ".join(lexical_tokens)
 
-    model = sentence_transformers.SentenceTransformer('/shared/model/')
-    phrases = text.split(",")
-    for textToLematize in phrases :
-        if textToLematize != "":
-            phrase = lematizeText(textToLematize)
-            result = ""
-            words = phrase.split(" ")
-            totalWords = len(words)
+    try:
+        model = sentence_transformers.SentenceTransformer('/shared/model/')
+        phrases = text.split(",")
+        sqlsentence = ""
+        for textToLematize in phrases :
+            if textToLematize != "":
+                phrase = lematizeText(textToLematize)
+                result = ""
+                words = phrase.split(" ")
+                totalWords = len(words)
 
-            i = 0
-            texts = []
-            while i + 3 <= totalWords :
-                texts.append(words[i]+" "+words[i+1]+" "+words[i+2])
-                i = i + 1
-
-            if len(texts) == 0 :
-                texts.append(phrase)
+                i = 0
+                texts = []
+                while i + 3 <= totalWords :
+                    texts.append(words[i]+" "+words[i+1]+" "+words[i+2])
+                    i = i + 1
+                
+                if len(texts) == 0 :
+                    texts.append(phrase)
 
-            embeddings = model.encode(texts, normalize_embeddings=True)
-            embeddingList = embeddings.tolist()
-
-            i = 0
-            for text in texts:
-                stmt = iris.sql.prepare("SELECT TOP 50 * FROM (SELECT CodeId, Description, VECTOR_DOT_PRODUCT(VectorDescription, TO_VECTOR(?, DECIMAL)) AS Similarity FROM ENCODER_Object.Codes) WHERE Similarity > 0.8 ORDER BY Similarity DESC")
-                resultSet = stmt.execute(str(embeddingList[i]))
-                df = resultSet.dataframe()
-                if (df.size > 0):
-                    for index, row in df.iterrows():
-                        textMatch = iris.cls('ENCODER.Object.TextMatches')._New()
-                        textMatch.RawText = text
-                        textMatch.CodeId = str(row['codeid'])
-                        textMatch.Description = str(row['description'])
-                        textMatch.Similarity = str(row['similarity'])
-                        textMatch.AnalysisId = analysisId
-                        status = textMatch._Save()
-                        result = result + "|"+ text+"_"+ str(row['codeid'])+":"+ str(row['description'])+":"+ str(row['similarity'])
-                i = i + 1
+                embeddings = model.encode(texts, normalize_embeddings=True)
+                embeddingList = embeddings.tolist()
+
+                i = 0
+                for text in texts:
+                    sqlsentence = "INSERT INTO ENCODER_Object.TextMatches (CodeId, Description, Similarity, AnalysisId, RawText) SELECT TOP 50 * FROM (SELECT CodeId, Description, VECTOR_DOT_PRODUCT(VectorDescription, TO_VECTOR('"+str(embeddingList[i])+"', DECIMAL)) AS Similarity, '"+analysisId+"', '"+text+"' FROM ENCODER_Object.Codes) WHERE Similarity > 0.8 ORDER BY Similarity DESC"
+                    utils = iris.cls("ENCODER.Utils.Manager").ExecuteInsertQuery(sqlsentence)
+
+                    i = i + 1
+    except Exception as err:
+        return repr(err)
 
-    return result
+    return "Success"
 }
 
 Storage Default

diff --git a/src/ENCODER/Object/TextMatches.cls b/src/ENCODER/Object/TextMatches.cls
@@ -5,7 +5,7 @@ Property RawText As %String;
 
 Property CodeId As %String;
 
-Property Description As %String;
+Property Description As %String(MAXLEN = 256);
 
 Property Similarity As %Double;
 

diff --git a/src/ENCODER/Utils/Manager.cls b/src/ENCODER/Utils/Manager.cls
@@ -1,6 +1,25 @@
 Class ENCODER.Utils.Manager Extends %RegisteredObject
 {
 
+/// Prepares and executes sqlQuery through %SQL.Statement; returns the execution result's %SQLCODE, or "0" when %Prepare fails or an exception is caught. NOTE(review): "0" is presumably also the %SQLCODE success value, so callers cannot tell these failures from success - confirm intent.
+ClassMethod ExecuteInsertQuery(sqlQuery As %String) As %String
+{
+    Try {
+        set statement = ##class(%SQL.Statement).%New()
+        set status = statement.%Prepare(sqlQuery)
+        if ($$$ISOK(status)) {
+            set result = statement.%Execute()
+            return result.%SQLCODE
+        }
+        else {
+            return "0"
+        }
+    }
+    Catch (ex) {
+        return "0"
+    }
+}
+
 ClassMethod GetEncoding(sentence As %String) As %String [ Language = python ]
 {
         import sentence_transformers