Skip to content

Commit

Permalink
Add docstrings. Add summary model(not visible for now considering tha…
Browse files Browse the repository at this point in the history
…t it works bad).
  • Loading branch information
TMN committed Nov 25, 2023
1 parent 2a7bf87 commit 8a93a7d
Show file tree
Hide file tree
Showing 9 changed files with 131 additions and 2 deletions.
56 changes: 55 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions project/source/ml/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@


def predict_entity(text: str):
"""
Highlights entities such as date, organization, location in text.
:param text:
:return res | text:
"""
try:
word_descriptions = pipe.predict(text)
res = {
Expand Down
5 changes: 5 additions & 0 deletions project/source/ml/sentiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@


def predict_sentiment(text: str):
"""
Defines sentiment of text.
:param text:
:return label:
"""
try:
output = sentiment_analysis.predict(text)
label = output[0]["label"]
Expand Down
45 changes: 45 additions & 0 deletions project/source/ml/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Hugging Face identifier of the pretrained T5 checkpoint used for summarization.
MODEL_NAME = "cointegrated/rut5-base-absum"
# Model and tokenizer are loaded once, at import time, and shared by every
# call to summarize(). NOTE(review): importing this module therefore triggers
# the checkpoint load (and presumably a download on first use) -- confirm this
# is acceptable for the web process start-up.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
# Switch the module to evaluation mode; the model is only used for inference here.
model.eval()


def summarize(
    text,
    n_words=None,
    compression=None,
    max_length=1000,
    num_beams=3,
    do_sample=False,
    repetition_penalty=10.0,
    **kwargs
):
    """
    Summarize the text with the module-level T5 model.

    The following parameters are mutually exclusive:
    - n_words (int) is an approximate number of words to generate.
    - compression (float) is an approximate length ratio of summary and original text.
    If both are passed, n_words wins and compression is ignored.

    :param text: text to summarize.
    :param n_words: optional length hint, sent to the model as a "[n] " prefix.
    :param compression: optional ratio hint, sent to the model as a "[r] " prefix
        formatted with one significant digit.
    :param max_length: forwarded to ``model.generate``.
    :param num_beams: forwarded to ``model.generate``.
    :param do_sample: forwarded to ``model.generate``.
    :param repetition_penalty: forwarded to ``model.generate``.
    :param kwargs: any extra keyword arguments for ``model.generate``.
    :return: the decoded summary, or the original ``text`` unchanged when
        anything in tokenization/generation fails.
    """
    try:
        # Build the model prompt in its own variable: the control prefix is an
        # implementation detail and must not leak into the fallback value
        # returned from the except branch below.
        if n_words:
            prompt = "[{}] ".format(n_words) + text
        elif compression:
            prompt = "[{0:.1g}] ".format(compression) + text
        else:
            prompt = text
        x = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
        # No gradients are needed for generation.
        with torch.inference_mode():
            out = model.generate(
                **x,
                max_length=max_length,
                num_beams=num_beams,
                do_sample=do_sample,
                repetition_penalty=repetition_penalty,
                **kwargs
            )
    except Exception:
        # Best-effort feature: report the failure and hand back the caller's
        # text exactly as it was received (without the "[...] " prefix).
        print("###_Exception in summary prediction_###")
        return text
    else:
        # Only the first generated sequence is decoded.
        return tokenizer.decode(out[0], skip_special_tokens=True)
11 changes: 11 additions & 0 deletions project/source/string_processing/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@


def find_start_end(string, pattern):
"""
Finds start and end indices of pattern in string (for highlighting).
:param string:
:param pattern:
:return:
"""
for elem in r"\.^$*+?{}[]|()":
pattern = pattern.replace(f"{elem}", rf"\{elem}")
matches = re.finditer(pattern, string)
Expand All @@ -20,6 +26,11 @@ def find_start_end(string, pattern):


def highlight_words(text: str) -> str:
"""
Creates an HTML block with highlighted objects.
:param text:
:return:
"""
try:
to_highlight, new_text = string_validator(text)
text_for_model = preprocess_for_model(text)
Expand Down
2 changes: 1 addition & 1 deletion project/source/string_processing/validator_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def string_validator(raw_text: str):
def replace_day(raw_text: str) -> str:

"""
replacing text data with datatime
Convert text date/time words into datetime format entity.
Parameters
----------
Expand Down
5 changes: 5 additions & 0 deletions project/source/templates/main.html
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
<div class="text-input-container">
<label for="post_content" class="form-subtitle">Введите обращение:</label><br>
<textarea class="form-control" id="post_content" name="post_content" style="height: 100px; border: 1px solid;"></textarea><br>
{% if summary %}
<div class="text-input-container">
{{summary}}
</div>
{% endif %}
<div class="button-container">
<button type="submit" class="btn btn-primary" style="margin-top: 0; margin-bottom: 2%;">Отправить</button>
<a class="btn btn-outline-primary" style="width: 105px; margin-bottom: 2%;" href="/dashboard">Статистика</a>
Expand Down
3 changes: 3 additions & 0 deletions project/source/web/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from project.source.string_processing.utilities import highlight_words
from project.source.ml.inference import setup_model
from project.source.ml.sentiment import predict_sentiment
from project.source.ml.summary import summarize
from project.source.utils import theme_to_group
from project.source.web.models import Appeal
import pandas as pd
Expand All @@ -28,6 +29,7 @@ def main_win(request):
if text == "":
return render(request, "main.html")
else:
# _summary = summarize(text)
sentiment = predict_sentiment(text)
sentiment_cases = {
"negative": "negative",
Expand Down Expand Up @@ -59,6 +61,7 @@ def main_win(request):
"theme": themes[0],
"group": groups[0],
"sentiment": sentiment_addition,
# "summary": _summary,
},
)
else:
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ django = "^4.2.7"
emoji = "^2.8.0"
plotly = "^5.18.0"
datetime = "^5.3"
sentencepiece = "^0.1.99"

[tool.poetry.dev-dependencies]
pytest = "^6.2.5"
Expand Down

0 comments on commit 8a93a7d

Please sign in to comment.