circlemind-ai
diff --git a/‎fast_graphrag/_graphrag.py
Lines changed: 17 additions & 15 deletions b/‎fast_graphrag/_graphrag.py
Lines changed: 17 additions & 15 deletions
diff --git a/‎fast_graphrag/_llm/_llm_openai.py
Lines changed: 1 addition & 1 deletion b/‎fast_graphrag/_llm/_llm_openai.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎fast_graphrag/_prompt.py
Lines changed: 42 additions & 46 deletions b/‎fast_graphrag/_prompt.py
Lines changed: 42 additions & 46 deletions
diff --git a/‎fast_graphrag/_services/_state_manager.py
Lines changed: 4 additions & 4 deletions b/‎fast_graphrag/_services/_state_manager.py
Lines changed: 4 additions & 4 deletions
diff --git a/‎fast_graphrag/_storage/_vdb_hnswlib.py
Lines changed: 1 addition & 1 deletion b/‎fast_graphrag/_storage/_vdb_hnswlib.py
Lines changed: 1 addition & 1 deletion
@@ -23,11 +23,11 @@ class InsertParam:
 
 @dataclass
 class QueryParam:
-    with_references: bool = False
-    only_context: bool = False
-    entities_max_tokens: int = 4000
-    relationships_max_tokens: int = 3000
-    chunks_max_tokens: int = 9000
+    with_references: bool = field(default=False)
+    only_context: bool = field(default=False)
+    entities_max_tokens: int = field(default=4000)
+    relations_max_tokens: int = field(default=3000)
+    chunks_max_tokens: int = field(default=9000)
 
 
 @dataclass
@@ -177,13 +177,21 @@ async def async_query(
         )
 
         # Retrieve relevant state
-        relevant_state = await self.state_manager.get_context(query=query, entities=extracted_entities)
-        if relevant_state is None:
+        context = await self.state_manager.get_context(query=query, entities=extracted_entities)
+        if context is None:
             return TQueryResponse[GTNode, GTEdge, GTHash, GTChunk](
                 response=PROMPTS["fail_response"], context=TContext([], [], [])
             )
 
         # Ask LLM
+        context_str = context.truncate(
+            max_chars={
+                "entities": params.entities_max_tokens * TOKEN_TO_CHAR_RATIO,
+                "relations": params.relations_max_tokens * TOKEN_TO_CHAR_RATIO,
+                "chunks": params.chunks_max_tokens * TOKEN_TO_CHAR_RATIO,
+            },
+            output_context_str=not params.only_context
+        )
         if params.only_context:
             answer = ""
         else:
@@ -194,19 +202,13 @@ async def async_query(
                 llm=self.llm_service,
                 format_kwargs={
                     "query": query,
-                    "context": relevant_state.to_str(
-                        {
-                            "entities": params.entities_max_tokens * TOKEN_TO_CHAR_RATIO,
-                            "relationships": params.relationships_max_tokens * TOKEN_TO_CHAR_RATIO,
-                            "chunks": params.chunks_max_tokens * TOKEN_TO_CHAR_RATIO,
-                        }
-                    ),
+                    "context": context_str
                 },
                 response_model=TAnswer,
             )
             answer = llm_response.answer
 
-        return TQueryResponse[GTNode, GTEdge, GTHash, GTChunk](response=answer, context=relevant_state)
+        return TQueryResponse[GTNode, GTEdge, GTHash, GTChunk](response=answer, context=context)
 
     def save_graphml(self, output_path: str) -> None:
         """Save the graph in GraphML format."""
 
@@ -57,7 +57,7 @@ def __post_init__(self):
             raise ValueError("Invalid client type. Must be 'openai' or 'azure'")
         logger.debug("Initialized OpenAILLMService with patched OpenAI client.")
 
-    @throttle_async_func_call(max_concurrent=256, stagger_time=0.001, waiting_time=0.001)
+    @throttle_async_func_call(max_concurrent=1024, stagger_time=0.001, waiting_time=0.001)
     async def send_message(
         self,
         prompt: str,
 
@@ -5,59 +5,57 @@
 PROMPTS: Dict[str, Any] = {}
 
 ## NEW
-PROMPTS["entity_relationship_extraction"] = """You are a helpful assistant that helps a human analyst perform information discovery in the following domain.
-
-# DOMAIN
+PROMPTS["entity_relationship_extraction"] = """# DOMAIN PROMPT
 {domain}
 
 # GOAL
-Given a document and a list of types, first, identify all present entities of those types and, then, all relationships among the identified entities.
 Your goal is to highlight information that is relevant to the domain and the questions that may be asked on it.
+Given an input document, identify all relevant entities and all relationships among them.
 
 Examples of possible questions:
 {example_queries}
 
 # STEPS
-1. Identify all entities of the given types. Make sure to extract all and only the entities that are of one of the given types, ignore the others. Use singular names and split compound concepts when necessary (for example, from the sentence "they are movie and theater directors", you should extract the entities "movie director" and "theater director").
+1. Identify all entities of the given types. Make sure to extract all and only the entities that are of one of the given types. Use singular names and split compound concepts when necessary (for example, from the sentence "they are movie and theater directors", you should extract the entities "movie director" and "theater director").
 2. Identify all relationships between the entities found in step 1. Clearly resolve pronouns to their specific names to maintain clarity.
 3. Double check that each entity identified in step 1 appears in at least one relationship. If not, add the missing relationships.
 
 # EXAMPLE DATA
 Example types: [location, organization, person, communication]
-Example document: Radio City: Radio City is India's first private FM radio station and was started on 3 July 2001. It plays Hindi, English and regional songs. Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features."
+Example document: Radio City: Radio City is India's first private FM radio station and was started on 3 July 2001. It plays Hindi, English and regional songs. Radio City recently forayed into new media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.
 
 Output:
 {{
-	"entities": [
-	{{"name": "Radio City", "type": "organization", "desc": "Radio City is India's first private FM radio station."}},
-	{{"name": "India", "type": "location", "desc": "The country of India."}},
-	{{"name": "FM radio station", "type": "communication", "desc": "A radio station that broadcasts using frequency modulation."}},
-	{{"name": "English", "type": "communication", "desc": "The English language."}},
-	{{"name": "Hindi", "type": "communication", "desc": "The Hindi language."}},
-	{{"name": "New Media", "type": "communication", "desc": "New Media is a term for all forms of media that are digital and/or interactive."}},
-	{{"name": "PlanetRadiocity.com", "type": "organization", "desc": "PlanetRadiocity.com is an online music portal."}},
-	{{"name": "music portal", "type": "communication", "desc": "A website that offers music related information."}},
-	{{"name": "news", "type": "communication", "desc": "The concept of news."}},
-	{{"name": "video", "type": "communication", "desc": "The concept of a video."}},
-	{{"name": "song", "type": "communication", "desc": "The concept of a song."}}
-	],
-	"relationships": [
-	{{"source": "Radio City", "target": "India", "desc": "Radio City is located in India."}},
-	{{"source": "Radio City", "target": "FM radio station", "desc": "Radio City is a private FM radio station started on 3 July 2001."}},
-	{{"source": "Radio City", "target": "English", "desc": "Radio City broadcasts English songs."}},
-	{{"source": "Radio City", "target": "Hindi", "desc": "Radio City broadcasts songs in the Hindi language."}},
-	{{"source": "Radio City", "target": "PlanetRadiocity.com", "desc": "Radio City launched PlanetRadiocity.com in May 2008."}},
-	{{"source": "PlanetRadiocity.com", "target": "music portal", "desc": "PlanetRadiocity.com is a music portal that offers music related news, videos and more."}},
-	{{"source": "PlanetRadiocity.com", "target": "video", "desc": "PlanetRadiocity.com offers music related videos."}}
-	],
-	"other_relationships": [
-	{{"source": "Radio City", "target": "New Media", "desc": "Radio City forayed into New Media in May 2008."}},
-	{{"source": "PlanetRadiocity.com", "target": "news", "desc": "PlanetRadiocity.com offers music related news."}},
-	{{"source": "PlanetRadiocity.com", "target": "song", "desc": "PlanetRadiocity.com offers songs."}}
-	]
+"entities": [
+	{{"name": "RADIO CITY", "type": "organization", "desc": "Radio City is India's first private FM radio station"}},
+	{{"name": "INDIA", "type": "location", "desc": "A country"}},
+	{{"name": "FM RADIO STATION", "type": "communication", "desc": "A radio station that broadcasts using frequency modulation"}},
+	{{"name": "ENGLISH", "type": "communication", "desc": "A language"}},
+	{{"name": "HINDI", "type": "communication", "desc": "A language"}},
+	{{"name": "NEW MEDIA", "type": "communication", "desc": "New media"}},
+	{{"name": "PLANETRADIOCITY", "type": "organization", "desc": "PlanetRadiocity.com is an online music portal"}},
+	{{"name": "MUSIC PORTAL", "type": "communication", "desc": "A website that offers music related information"}},
+	{{"name": "NEWS", "type": "communication", "desc": "News"}},
+	{{"name": "VIDEO", "type": "communication", "desc": "Video"}},
+	{{"name": "SONG", "type": "communication", "desc": "Song"}}
+],
+"relationships": [
+	{{"source": "RADIO CITY", "target": "INDIA", "desc": "Radio City is located in India"}},
+	{{"source": "RADIO CITY", "target": "FM RADIO STATION", "desc": "Radio City is a private FM radio station started on 3 July 2001"}},
+	{{"source": "RADIO CITY", "target": "ENGLISH", "desc": "Radio City broadcasts English songs"}},
+	{{"source": "RADIO CITY", "target": "HINDI", "desc": "Radio City broadcasts songs in the Hindi language"}},
+	{{"source": "RADIO CITY", "target": "PLANETRADIOCITY", "desc": "Radio City launched PlanetRadiocity.com in May 2008"}},
+	{{"source": "PLANETRADIOCITY", "target": "MUSIC PORTAL", "desc": "PlanetRadiocity.com is a music portal"}},
+	{{"source": "PLANETRADIOCITY", "target": "NEWS", "desc": "PlanetRadiocity.com offers music related news"}},
+	{{"source": "PLANETRADIOCITY", "target": "SONG", "desc": "PlanetRadiocity.com offers songs"}}
+],
+"other_relationships": [
+	{{"source": "RADIO CITY", "target": "NEW MEDIA", "desc": "Radio City forayed into new media in May 2008"}},
+	{{"source": "PLANETRADIOCITY", "target": "VIDEO", "desc": "PlanetRadiocity.com offers music related videos"}}
+]
 }}
 
-# REAL DATA
+# INPUT DATA
 Types: {entity_types}
 Document: {input_text}
 
@@ -70,32 +68,30 @@
 
 PROMPTS["entity_extraction_query"] = """Given the query below, your task is to extract all entities relevant to perform information retrieval to produce an answer.
 
--Example 1-
+-EXAMPLE 1-
 Query: Who directed the film that was shot in or around Leland, North Carolina in 1986?
-Ouput: {{"named": ["Leland", "North Carolina", "1986"], "generic": ["film director"]}}
+Output: {{"named": ["[PLACE] Leland", "[STATE] North Carolina", "[YEAR] 1986"], "generic": ["film director"]}}
 
--Example 2-
+-EXAMPLE 2-
 Query: What relationship does Fred Gehrke have to the 23rd overall pick in the 2010 Major League Baseball Draft?
-Ouput: {{"named": ["Fred Gehrke", "2010 Major League Baseball Draft"], "generic": ["23rd baseball draft pick"]}}
+Output: {{"named": ["[BASEBALL PLAYER] Fred Gehrke", "[EVENT] 2010 Major League Baseball Draft"], "generic": ["23rd baseball draft pick"]}}
 
-# INPUT
+-INPUT-
 Query: {query}
 Output:
 """
 
 
-
 PROMPTS[
 	"summarize_entity_descriptions"
-] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
-Given the current description, summarize it in a shorter but comprehensive description. Make sure to include all important information.
-If the provided description is contradictory, please resolve the contradictions and provide a single, coherent summary.
-Make sure it is written in third person, and include the entity names so we the have full context.
+] = """You are a helpful assistant responsible for generating a summary of the data provided below.
+Given the current description, summarize it by removing redundant and generic information. Resolve any contradictions and provide a single, coherent summary.
+Write in third person and explicitly include the entity names to preserve the full context.
 
-Current description:
+Current:
 {description}
 
-Updated description:
+Updated:
 """
 
 
 
@@ -35,7 +35,7 @@
 @dataclass
 class DefaultStateManagerService(BaseStateManagerService[TEntity, TRelation, THash, TChunk, TId, TEmbedding]):
     blob_storage_cls: Type[BaseBlobStorage[csr_matrix]] = field(default=PickleBlobStorage)
-    insert_similarity_score_threshold: float = field(default=0.8)
+    insert_similarity_score_threshold: float = field(default=0.9)
     query_similarity_score_threshold: Optional[float] = field(default=0.7)
 
     def __post_init__(self):
@@ -133,7 +133,7 @@ async def _get_graphs(
         # when selecting the index order.
         progress_bar.set_description("Building... [entity deduplication]")
         upserted_indices = np.array([i for i, _ in upserted_nodes]).reshape(-1, 1)
-        similar_indices, scores = await self.entity_storage.get_knn(embeddings, top_k=5)
+        similar_indices, scores = await self.entity_storage.get_knn(embeddings, top_k=3)
         similar_indices = np.array(similar_indices)
         scores = np.array(scores)
 
@@ -190,7 +190,7 @@ async def get_context(
 
         try:
             query_embeddings = await self.embedding_service.encode(
-                [f"[NAME] {n}" for n in entities["named"]] + [f"[NAME] {n}" for n in entities["generic"]] + [query]
+                [f"{n}" for n in entities["named"]] + [f"[NONE] {n}" for n in entities["generic"]] + [query]
             )
             entity_scores: List[csr_matrix] = []
             # Similarity-search over entities
@@ -255,7 +255,7 @@ async def get_context(
                 if chunk is not None:
                     relevant_chunks.append((chunk, s))
 
-            return TContext(entities=relevant_entities, relationships=relevant_relationships, chunks=relevant_chunks)
+            return TContext(entities=relevant_entities, relations=relevant_relationships, chunks=relevant_chunks)
         except Exception as e:
             logger.error(f"Error during scoring of chunks and relationships.\n{e}")
             raise e
 
@@ -16,7 +16,7 @@
 
 @dataclass
 class HNSWVectorStorageConfig:
-    ef_construction: int = field(default=256)
+    ef_construction: int = field(default=128)
     M: int = field(default=64)
     ef_search: int = field(default=96)
     num_threads: int = field(default=-1)