Skip to content

Commit 75f3fe6

Browse files
committed
Minor updates
1 parent 33e01ca commit 75f3fe6

File tree

2 files changed

+13
-19
lines changed

2 files changed

+13
-19
lines changed

src/ENCODER/BP/AnalyzeTextProcess.cls

Lines changed: 8 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -16,49 +16,42 @@ Method AnalyzeText(text As %String, analysisId As %String, language As %String)
1616
import iris
1717
import spacy
1818

19-
def lematizeText(textToLematize):
20-
19+
try:
2120
lematize = ''
2221
if language == 'es':
2322
lematize = 'es_core_news_sm'
2423
else:
2524
lematize = 'en_core_web_sm'
2625

2726
nlp = spacy.load(lematize)
28-
doc = nlp(textToLematize)
29-
words = [t.orth_ for t in doc if not t.is_punct | t.is_stop]
30-
lexical_tokens = [t.lower() for t in words if t.isalpha()]
31-
32-
return " ".join(lexical_tokens)
3327

34-
try:
3528
model = sentence_transformers.SentenceTransformer('/shared/model/')
3629
phrases = text.split(",")
3730
sqlsentence = ""
3831
for textToLematize in phrases :
3932
if textToLematize != "":
40-
phrase = lematizeText(textToLematize)
33+
doc = nlp(textToLematize)
34+
words = [t.orth_ for t in doc if not t.is_punct | t.is_stop]
35+
lexical_tokens = [t.lower() for t in words if t.isalpha()]
36+
phrase = " ".join(lexical_tokens)
4137
result = ""
4238
words = phrase.split(" ")
4339
totalWords = len(words)
44-
4540
i = 0
4641
texts = []
4742
while i + 3 <= totalWords :
4843
texts.append(words[i]+" "+words[i+1]+" "+words[i+2])
4944
i = i + 1
50-
5145
if len(texts) == 0 :
5246
texts.append(phrase)
5347

5448
embeddings = model.encode(texts, normalize_embeddings=True)
5549
embeddingList = embeddings.tolist()
56-
5750
i = 0
5851
for text in texts:
59-
sqlsentence = "INSERT INTO ENCODER_Object.TextMatches (CodeId, Description, Similarity, AnalysisId, RawText) SELECT TOP 50 * FROM (SELECT CodeId, Description, VECTOR_DOT_PRODUCT(VectorDescription, TO_VECTOR('"+str(embeddingList[i])+"', DECIMAL)) AS Similarity, '"+analysisId+"', '"+text+"' FROM ENCODER_Object.Codes) WHERE Similarity > 0.8 ORDER BY Similarity DESC"
60-
utils = iris.cls("ENCODER.Utils.Manager").ExecuteInsertQuery(sqlsentence)
61-
52+
sqlsentence = "INSERT INTO ENCODER_Object.TextMatches (CodeId, Description, Similarity, AnalysisId, RawText) SELECT * FROM (SELECT CodeId, Description, VECTOR_DOT_PRODUCT(VectorDescription, TO_VECTOR('"+str(embeddingList[i])+"', DECIMAL)) AS Similarity, '"+analysisId+"', '"+text+"' FROM ENCODER_Object.Codes) WHERE Similarity > 0.8"
53+
#iris.cls("Ens.Util.Log").LogInfo("ENCODER.BP.AnalyzeTextProcess", "AnalyzeText", sqlsentence)
54+
iris.cls("ENCODER.Utils.Manager").ExecuteInsertQuery(sqlsentence)
6255
i = i + 1
6356
except Exception as err:
6457
return repr(err)

src/ENCODER/Object/TextMatches.cls

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ Property CodeId As %String;
77

88
Property Description As %String(MAXLEN = 256);
99

10-
Property Similarity As %Double;
10+
Property Similarity As %Double(STORAGEDEFAULT = "columnar");
1111

1212
Property AnalysisId As %Integer;
1313

@@ -29,12 +29,13 @@ Storage Default
2929
<Value>Description</Value>
3030
</Value>
3131
<Value name="5">
32-
<Value>Similarity</Value>
33-
</Value>
34-
<Value name="6">
3532
<Value>AnalysisId</Value>
3633
</Value>
3734
</Data>
35+
<Data name="_CDM_Similarity">
36+
<Attribute>Similarity</Attribute>
37+
<Structure>vector</Structure>
38+
</Data>
3839
<DataLocation>^ENCODER.Object.TextMatchesD</DataLocation>
3940
<DefaultData>TextMatchesDefaultData</DefaultData>
4041
<IdLocation>^ENCODER.Object.TextMatchesD</IdLocation>

0 commit comments

Comments
 (0)