Skip to content

Commit

Permalink
Optimized text analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
isc-lperezra committed Aug 21, 2024
1 parent 3e2f99a commit 0d29c5f
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 47 deletions.
6 changes: 3 additions & 3 deletions encoder-ui/src/app/analyzer/analyzer.component.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ export class AnalyzerComponent {
this.diagnostics = [];
var textHTML = this.textToAnalyze?.value;
var textOriginal = textHTML;
var textToProcess = this.textToAnalyze?.value.split(".");
var forReading = 100/(textToProcess.length-1);
var textToProcess = this.textToAnalyze?.value.split(".").filter(Boolean);
var forReading = 100/(textToProcess.length);
this.totalReceived = 0;
this.error = false;
this.loading = true;
Expand All @@ -58,7 +58,7 @@ export class AnalyzerComponent {
"Text": textOriginal
};
this.irisService.saveRawText(rawText).subscribe({next: raw => {
this.totalReceived = 5
this.totalReceived = (100%(textToProcess.length)) + 1;
for (var index in textToProcess){
if (textToProcess[index] !== "")
{
Expand Down
8 changes: 1 addition & 7 deletions encoder-ui/src/app/codelist/codelist.component.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { IrisService } from '../services/iris.service';
templateUrl: './codelist.component.html',
styleUrl: './codelist.component.scss'
})
export class CodelistComponent implements OnInit, OnChanges {
export class CodelistComponent implements OnChanges {

@Input() codeRequestId: string = "";
@Output() newCodeEvent = new EventEmitter<any>();
Expand All @@ -27,12 +27,6 @@ export class CodelistComponent implements OnInit, OnChanges {
this.loadCodes();
}
}

ngOnInit(): void {
if (this.codeRequestId !== "") {
this.loadCodes();
}
}

loadCodes(): void {
this.irisService.getCodeOptions(this.codeRequestId).subscribe({
Expand Down
66 changes: 30 additions & 36 deletions src/ENCODER/BP/AnalyzeTextProcess.cls
Original file line number Diff line number Diff line change
Expand Up @@ -31,45 +31,39 @@ Method AnalyzeText(text As %String, analysisId As %String, language As %String)

return " ".join(lexical_tokens)

model = sentence_transformers.SentenceTransformer('/shared/model/')
phrases = text.split(",")
for textToLematize in phrases :
if textToLematize != "":
phrase = lematizeText(textToLematize)
result = ""
words = phrase.split(" ")
totalWords = len(words)
try:
model = sentence_transformers.SentenceTransformer('/shared/model/')
phrases = text.split(",")
sqlsentence = ""
for textToLematize in phrases :
if textToLematize != "":
phrase = lematizeText(textToLematize)
result = ""
words = phrase.split(" ")
totalWords = len(words)

i = 0
texts = []
while i + 3 <= totalWords :
texts.append(words[i]+" "+words[i+1]+" "+words[i+2])
i = i + 1

if len(texts) == 0 :
texts.append(phrase)
i = 0
texts = []
while i + 3 <= totalWords :
texts.append(words[i]+" "+words[i+1]+" "+words[i+2])
i = i + 1
if len(texts) == 0 :
texts.append(phrase)

embeddings = model.encode(texts, normalize_embeddings=True)
embeddingList = embeddings.tolist()

i = 0
for text in texts:
stmt = iris.sql.prepare("SELECT TOP 50 * FROM (SELECT CodeId, Description, VECTOR_DOT_PRODUCT(VectorDescription, TO_VECTOR(?, DECIMAL)) AS Similarity FROM ENCODER_Object.Codes) WHERE Similarity > 0.8 ORDER BY Similarity DESC")
resultSet = stmt.execute(str(embeddingList[i]))
df = resultSet.dataframe()
if (df.size > 0):
for index, row in df.iterrows():
textMatch = iris.cls('ENCODER.Object.TextMatches')._New()
textMatch.RawText = text
textMatch.CodeId = str(row['codeid'])
textMatch.Description = str(row['description'])
textMatch.Similarity = str(row['similarity'])
textMatch.AnalysisId = analysisId
status = textMatch._Save()
result = result + "|"+ text+"_"+ str(row['codeid'])+":"+ str(row['description'])+":"+ str(row['similarity'])
i = i + 1
embeddings = model.encode(texts, normalize_embeddings=True)
embeddingList = embeddings.tolist()

i = 0
for text in texts:
sqlsentence = "INSERT INTO ENCODER_Object.TextMatches (CodeId, Description, Similarity, AnalysisId, RawText) SELECT TOP 50 * FROM (SELECT CodeId, Description, VECTOR_DOT_PRODUCT(VectorDescription, TO_VECTOR('"+str(embeddingList[i])+"', DECIMAL)) AS Similarity, '"+analysisId+"', '"+text+"' FROM ENCODER_Object.Codes) WHERE Similarity > 0.8 ORDER BY Similarity DESC"
utils = iris.cls("ENCODER.Utils.Manager").ExecuteInsertQuery(sqlsentence)

i = i + 1
except Exception as err:
return repr(err)

return result
return "Success"
}

Storage Default
Expand Down
2 changes: 1 addition & 1 deletion src/ENCODER/Object/TextMatches.cls
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Property RawText As %String;

Property CodeId As %String;

Property Description As %String;
Property Description As %String(MAXLEN = 256);

Property Similarity As %Double;

Expand Down
19 changes: 19 additions & 0 deletions src/ENCODER/Utils/Manager.cls
Original file line number Diff line number Diff line change
@@ -1,6 +1,25 @@
Class ENCODER.Utils.Manager Extends %RegisteredObject
{

/// Prepares and executes the dynamic SQL statement passed in <var>sqlQuery</var>.
/// The statement text is run exactly as received, with no bound parameters, so the
/// caller is responsible for building safe SQL.
/// <br>Returns the SQLCODE of the operation: the value reported by the statement
/// result when the statement was executed, or a negative SQLCODE when the statement
/// could not be prepared or an exception was thrown. Failures never return "0",
/// because 0 is the SQLCODE for success and would hide the error from the caller.
ClassMethod ExecuteInsertQuery(sqlQuery As %String) As %String
{
    Try {
        set statement = ##class(%SQL.Statement).%New()
        set status = statement.%Prepare(sqlQuery)
        if ($$$ISERR(status)) {
            // Prepare failed: convert the %Status into its SQLCODE so the caller
            // receives a real error code rather than a success-looking "0".
            return $SYSTEM.Status.StatusToSQLCODE(status)
        }
        set result = statement.%Execute()
        return result.%SQLCODE
    }
    Catch (ex) {
        // Surface the exception as an SQLCODE instead of swallowing it as "0".
        return ex.AsSQLCODE()
    }
}

ClassMethod GetEncoding(sentence As %String) As %String [ Language = python ]
{
import sentence_transformers
Expand Down

0 comments on commit 0d29c5f

Please sign in to comment.