updated readme and example #314

Merged · 3 commits · Jun 29, 2024

Changes from all commits
344 changes: 155 additions & 189 deletions README.md

Large diffs are not rendered by default.

10 changes: 6 additions & 4 deletions querent/core/transformers/bert_ner_opensourcellm.py
@@ -1,4 +1,5 @@
import json
import uuid
from transformers import AutoConfig, AutoTokenizer
import transformers
import time
@@ -298,7 +299,6 @@ async def process_tokens(self, data: IngestedTokens):
try:
doc_entity_pairs = []
doc_source = data.doc_source

if not BERTLLM.validate_ingested_tokens(data):
self.set_termination_event()
return
@@ -311,7 +311,7 @@ async def process_tokens(self, data: IngestedTokens):
doc_entity_pairs = self._get_entity_pairs(content)
if not doc_entity_pairs:
return

doc_entity_pairs = self._process_entity_types(doc_entity_pairs)
if not self.entity_context_extractor and not self.predicate_context_extractor:
pairs_withattn = self.attn_scores_instance.extract_and_append_attention_weights(doc_entity_pairs)
@@ -341,6 +341,7 @@ def _prepare_content(self, data):
else:
content = clean_text
file = data.get_file_path()

return content, file

def _get_entity_pairs(self, content):
@@ -414,8 +415,9 @@ async def _process_embedding_triples(self, embedding_triples, file, doc_source):
for triple in embedding_triples:
if self.termination_event.is_set():
return
event_id = str(uuid.uuid4())

graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple, event_id=event_id))
if graph_json:
current_state = EventState(
event_type=EventType.Graph,
@@ -436,7 +438,7 @@ async def _process_embedding_triples(self, embedding_triples, file, doc_source):
base_weights=[predicate_score, predicate_score, 3],
normalize_weights=True # Normalize weights to ensure they sum to 1
)
vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple=triple, embeddings=final_emb))
vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple=triple, embeddings=final_emb,event_id=event_id))
if vector_json:
current_state = EventState(
event_type=EventType.Vector,
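In this file the change generates a single event_id per extracted triple and passes it to both converters, so the Graph event and its matching Vector event can be correlated downstream. A minimal sketch of that pattern, assuming simplified stand-ins for the converter calls (the helper name and payload fields here are illustrative, not the library API):

import json
import uuid

def emit_events_for_triple(triple, embedding):
    # One id per triple; the same value is written into both payloads
    # so a consumer (e.g. the tutorial's Postgres tables) can join them.
    event_id = str(uuid.uuid4())
    graph_payload = {"event_id": event_id, "subject": triple[0], "object": triple[2]}
    vector_payload = {"event_id": event_id, "embeddings": list(embedding)}
    return json.dumps(graph_payload), json.dumps(vector_payload)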
@@ -172,9 +172,10 @@ def process_tokens(ner_instance : NER_LLM, extractor, filtered_triples, nlp_mode
updated_triples = []
for subject, predicate_metadata, object in filtered_triples:
try:
context = predicate_metadata['current_sentence'].replace("\n"," ")
context = predicate_metadata['current_sentence'].replace("\n"," ").lower()
head_positions = ner_instance.find_subword_indices(context, predicate_metadata['entity1_nn_chunk'])
tail_positions = ner_instance.find_subword_indices(context, predicate_metadata['entity2_nn_chunk'])

if head_positions[0][0] > tail_positions[0][0]:
head_entity = {'entity': object, 'noun_chunk':predicate_metadata['entity2_nn_chunk'], 'entity_label':predicate_metadata['entity2_label'] }
tail_entity = {'entity': subject, 'noun_chunk':predicate_metadata['entity1_nn_chunk'], 'entity_label':predicate_metadata['entity1_label']}
@@ -188,20 +189,18 @@ def process_tokens(ner_instance : NER_LLM, extractor, filtered_triples, nlp_mode
attention_matrix = extractor.inference_attention(model_input)
token_idx_with_word = ner_instance.tokenize_sentence_with_positions(context)
spacy_doc = nlp_model(context)
filter = IndividualFilter(True, 0.02, token_idx_with_word, spacy_doc)

filter = IndividualFilter(True, 0.01, token_idx_with_word, spacy_doc)
## HEAD Entity Based Attention Search
candidate_paths = perform_search(entity_pair.head_entity['start_idx'], attention_matrix, entity_pair, search_candidates=5, require_contiguous=True, max_relation_length=8, num_initial_tokens=extractor.num_start_tokens())
candidate_paths = remove_duplicates(candidate_paths)
filtered_results = filter.filter(candidates=candidate_paths,e_pair=entity_pair)
predicate_he, score_he = get_best_relation(filtered_results)

##TAIL ENTITY Based Attention Search
candidate_paths = perform_search(entity_pair.tail_entity['start_idx'], attention_matrix, entity_pair, search_candidates=5, require_contiguous=True, max_relation_length=8, num_initial_tokens=extractor.num_start_tokens())
candidate_paths = remove_duplicates(candidate_paths)
filtered_results = filter.filter(candidates=candidate_paths,e_pair=entity_pair)
predicate_te, score_te = get_best_relation(filtered_results)

if score_he > score_te and (score_he >= 0.1 or score_te >= 0.1):
triple = create_semantic_triple(head_entity=head_entity['noun_chunk'],
tail_entity=tail_entity['noun_chunk'],
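This hunk lowercases the sentence before subword lookup, relaxes the IndividualFilter threshold from 0.02 to 0.01, and then keeps whichever of the head-based or tail-based predicate candidates scores higher. A hedged sketch of that arbitration step (the function name is hypothetical; only the 0.1 score floor comes from the diff):

def pick_predicate(predicate_he, score_he, predicate_te, score_te, floor=0.1):
    # Drop both candidates if neither attention search clears the floor.
    if score_he < floor and score_te < floor:
        return None, 0.0
    # Otherwise prefer the higher-scoring search.
    if score_he > score_te:
        return predicate_he, score_he
    return predicate_te, score_te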
6 changes: 4 additions & 2 deletions querent/kg/rel_helperfunctions/triple_to_json.py
@@ -31,14 +31,15 @@ def _parse_json_str(json_str):
raise ValueError(f"Error decoding JSON: {e}")

@staticmethod
def convert_graphjson(triple):
def convert_graphjson(triple, event_id = None):
try:
subject, json_str, object_ = triple
predicate_info = TripleToJsonConverter._parse_json_str(json_str)
if predicate_info is None:
return {}

json_object = {
"event_id": event_id,
"subject": TripleToJsonConverter._normalize_text(subject, replace_space=True),
"subject_type": TripleToJsonConverter._normalize_text(predicate_info.get("subject_type", "Unlabeled"), replace_space=True),
"object": TripleToJsonConverter._normalize_text(object_, replace_space=True),
@@ -67,7 +68,7 @@ def dynamic_weighted_average_embeddings(embeddings, base_weights, normalize_weig
return weighted_sum

@staticmethod
def convert_vectorjson(triple, blob = None, embeddings=None):
def convert_vectorjson(triple, blob = None, embeddings=None, event_id = None):
try:
subject, json_str, object_ = triple
data = TripleToJsonConverter._parse_json_str(json_str)
@@ -76,6 +77,7 @@ def convert_vectorjson(triple, blob = None, embeddings=None):

id_format = f"{TripleToJsonConverter._normalize_text(subject,replace_space=True)}-{TripleToJsonConverter._normalize_text(data.get('predicate', ''),replace_space=True)}-{TripleToJsonConverter._normalize_text(object_,replace_space=True)}"
json_object = {
"event_id": event_id,
"id": TripleToJsonConverter._normalize_text(id_format),
"embeddings": embeddings.tolist(),
"size": len(embeddings.tolist()),
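With the optional event_id argument, convert_graphjson and convert_vectorjson now emit payloads that share one identifier. Roughly, the two objects look like this (values are invented for illustration; only keys visible in this diff are listed):

graph_event = {
    "event_id": "3f2a9c…",        # same uuid for both payloads of a triple
    "subject": "emily_stanton",
    "subject_type": "person",
    "object": "greenwood",
    # ...remaining graph fields are unchanged by this PR
}
vector_event = {
    "event_id": "3f2a9c…",        # joins the embedding back to its graph row
    "id": "emily_stanton-coaches-greenwood",
    "embeddings": [0.12, -0.07],  # truncated for illustration
    "size": 2,
}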
Binary file added tests/data/readme_assets/example.pdf
Binary file not shown.
Empty file added tests/tutorial/__init__.py
23 changes: 23 additions & 0 deletions tests/tutorial/docker-compose.yaml
@@ -0,0 +1,23 @@
version: '3'
services:
  postgres:
    image: pgvector/pgvector:pg16
    environment:
      - POSTGRES_USER=querent
      - POSTGRES_PASSWORD=querent
      - POSTGRES_DB=querent_test
    volumes:
      - ./quester/storage/sql/:/docker-entrypoint-initdb.d
    ports:
      - "5432:5432"
    networks:
      - querent
    healthcheck:
      test: ["CMD-SHELL", "pg_isready", "-d", "querent_test"]
      interval: 30s
      timeout: 60s
      retries: 5
      start_period: 80s

networks:
  querent:
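This compose file brings up a pgvector-enabled Postgres for the tutorial on localhost:5432, with the same querent/querent credentials and querent_test database that the example script below uses. A quick connectivity check, assuming psycopg2-binary is installed (the check itself is not part of this PR):

import psycopg2

# Connects to the container started with `docker compose up -d` in tests/tutorial.
conn = psycopg2.connect(
    dbname="querent_test", user="querent", password="querent",
    host="localhost", port=5432,
)
with conn, conn.cursor() as cur:
    cur.execute("SELECT version();")
    print(cur.fetchone()[0])
conn.close()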
102 changes: 102 additions & 0 deletions tests/tutorial/example_fixed_entities.py
@@ -0,0 +1,102 @@
import asyncio
from asyncio import Queue
import json
from pathlib import Path
from querent.callback.event_callback_interface import EventCallbackInterface
from querent.collectors.fs.fs_collector import FSCollectorFactory
from querent.common.types.querent_event import EventState, EventType
from querent.config.collector.collector_config import FSCollectorConfig
from querent.common.uri import Uri
from querent.config.core.llm_config import LLM_Config
from querent.ingestors.ingestor_manager import IngestorFactoryManager
import uuid
import numpy as np
from querent.core.transformers.bert_ner_opensourcellm import BERTLLM
from querent.querent.resource_manager import ResourceManager
from querent.querent.querent import Querent
from postgres_utility import DatabaseManager

async def ingest_all_async():
    db_manager = DatabaseManager(
        dbname="querent_test",
        user="querent",
        password="querent",
        host="localhost",
        port="5432"
    )

    db_manager.connect_db()
    db_manager.create_tables()
    directories = ["/home/nishantg/querent-main/querent/tests/data/readme_assets"]
    collectors = [
        FSCollectorFactory().resolve(
            Uri("file://" + str(Path(directory).resolve())),
            FSCollectorConfig(config_source={
                "id": str(uuid.uuid4()),
                "root_path": directory,
                "name": "Local-config",
                "config": {},
                "backend": "localfile",
                "uri": "file://",
            }),
        )
        for directory in directories
    ]

    result_queue = asyncio.Queue()

    ingestor_factory_manager = IngestorFactoryManager(
        collectors=collectors, result_queue=result_queue
    )
    ingest_task = asyncio.create_task(ingestor_factory_manager.ingest_all_async())
    resource_manager = ResourceManager()
    bert_llm_config = LLM_Config(
        # ner_model_name="English",
        rel_model_type = "bert",
        rel_model_path = 'bert-base-uncased',
        fixed_entities = [
            "university", "greenwood", "liam zheng", "department", "Metroville",
            "Emily Stanton", "Coach", "health", "training", "athletes"
        ],
        sample_entities = [
            "organization", "organization", "person", "department", "city",
            "person", "person", "method", "method", "person"
        ],
        is_confined_search = True
    )
    llm_instance = BERTLLM(result_queue, bert_llm_config)

    class StateChangeCallback(EventCallbackInterface):
        def handle_event(self, event_type: EventType, event_state: EventState):
            if event_state['event_type'] == EventType.Graph:
                triple = json.loads(event_state['payload'])
                db_manager.insert_metadata(
                    event_id=triple['event_id'],
                    subject=triple['subject'],
                    subject_type=triple['subject_type'],
                    predicate=triple['predicate'],
                    object=triple['object'],
                    object_type=triple['object_type'],
                    sentence=triple['sentence'],
                    file=event_state['file'],
                    doc_source=event_state['doc_source'],
                    score=triple['score']
                )
            elif event_state['event_type'] == EventType.Vector:
                triple_v = json.loads(event_state['payload'])
                db_manager.insert_embedding(
                    event_id=triple_v['event_id'],
                    embeddings=triple_v['embeddings'],
                )

    llm_instance.subscribe(EventType.Graph, StateChangeCallback())
    llm_instance.subscribe(EventType.Vector, StateChangeCallback())
    querent = Querent(
        [llm_instance],
        resource_manager=resource_manager,
    )
    querent_task = asyncio.create_task(querent.start())
    await asyncio.gather(ingest_task, querent_task)

if __name__ == "__main__":
    asyncio.run(ingest_all_async())
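The example imports DatabaseManager from a postgres_utility module that is not included in this diff. A minimal sketch of the interface the script relies on (connect_db, create_tables, insert_metadata, insert_embedding); the table layout and SQL below are assumptions, not the PR's actual helper:

import json
import psycopg2

class DatabaseManager:
    # Hypothetical stand-in for postgres_utility.DatabaseManager; schema is assumed.
    def __init__(self, dbname, user, password, host, port):
        self.params = dict(dbname=dbname, user=user, password=password,
                           host=host, port=port)
        self.conn = None

    def connect_db(self):
        self.conn = psycopg2.connect(**self.params)

    def create_tables(self):
        with self.conn, self.conn.cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS metadata (
                    event_id TEXT PRIMARY KEY, subject TEXT, subject_type TEXT,
                    predicate TEXT, object TEXT, object_type TEXT,
                    sentence TEXT, file TEXT, doc_source TEXT, score REAL);
                CREATE TABLE IF NOT EXISTS embedding (
                    event_id TEXT PRIMARY KEY, embeddings TEXT);
            """)

    def insert_metadata(self, **row):
        cols = ", ".join(row)
        placeholders = ", ".join(["%s"] * len(row))
        with self.conn, self.conn.cursor() as cur:
            cur.execute(f"INSERT INTO metadata ({cols}) VALUES ({placeholders})",
                        list(row.values()))

    def insert_embedding(self, event_id, embeddings):
        with self.conn, self.conn.cursor() as cur:
            cur.execute("INSERT INTO embedding (event_id, embeddings) VALUES (%s, %s)",
                        (event_id, json.dumps(embeddings)))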