Skip to content

Commit 4475116

Browse files
authored
Merge pull request #45 from circlemind-ai/prod
Add parameters improvements from test branch
2 parents 97328c4 + 655194c commit 4475116

File tree

8 files changed

+651
-612
lines changed

8 files changed

+651
-612
lines changed

fast_graphrag/_graphrag.py

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@ class InsertParam:
2323

2424
@dataclass
2525
class QueryParam:
26-
with_references: bool = False
27-
only_context: bool = False
28-
entities_max_tokens: int = 4000
29-
relationships_max_tokens: int = 3000
30-
chunks_max_tokens: int = 9000
26+
with_references: bool = field(default=False)
27+
only_context: bool = field(default=False)
28+
entities_max_tokens: int = field(default=4000)
29+
relations_max_tokens: int = field(default=3000)
30+
chunks_max_tokens: int = field(default=9000)
3131

3232

3333
@dataclass
@@ -177,13 +177,21 @@ async def async_query(
177177
)
178178

179179
# Retrieve relevant state
180-
relevant_state = await self.state_manager.get_context(query=query, entities=extracted_entities)
181-
if relevant_state is None:
180+
context = await self.state_manager.get_context(query=query, entities=extracted_entities)
181+
if context is None:
182182
return TQueryResponse[GTNode, GTEdge, GTHash, GTChunk](
183183
response=PROMPTS["fail_response"], context=TContext([], [], [])
184184
)
185185

186186
# Ask LLM
187+
context_str = context.truncate(
188+
max_chars={
189+
"entities": params.entities_max_tokens * TOKEN_TO_CHAR_RATIO,
190+
"relations": params.relations_max_tokens * TOKEN_TO_CHAR_RATIO,
191+
"chunks": params.chunks_max_tokens * TOKEN_TO_CHAR_RATIO,
192+
},
193+
output_context_str=not params.only_context
194+
)
187195
if params.only_context:
188196
answer = ""
189197
else:
@@ -194,19 +202,13 @@ async def async_query(
194202
llm=self.llm_service,
195203
format_kwargs={
196204
"query": query,
197-
"context": relevant_state.to_str(
198-
{
199-
"entities": params.entities_max_tokens * TOKEN_TO_CHAR_RATIO,
200-
"relationships": params.relationships_max_tokens * TOKEN_TO_CHAR_RATIO,
201-
"chunks": params.chunks_max_tokens * TOKEN_TO_CHAR_RATIO,
202-
}
203-
),
205+
"context": context_str
204206
},
205207
response_model=TAnswer,
206208
)
207209
answer = llm_response.answer
208210

209-
return TQueryResponse[GTNode, GTEdge, GTHash, GTChunk](response=answer, context=relevant_state)
211+
return TQueryResponse[GTNode, GTEdge, GTHash, GTChunk](response=answer, context=context)
210212

211213
def save_graphml(self, output_path: str) -> None:
212214
"""Save the graph in GraphML format."""

fast_graphrag/_llm/_llm_openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def __post_init__(self):
5757
raise ValueError("Invalid client type. Must be 'openai' or 'azure'")
5858
logger.debug("Initialized OpenAILLMService with patched OpenAI client.")
5959

60-
@throttle_async_func_call(max_concurrent=256, stagger_time=0.001, waiting_time=0.001)
60+
@throttle_async_func_call(max_concurrent=1024, stagger_time=0.001, waiting_time=0.001)
6161
async def send_message(
6262
self,
6363
prompt: str,

fast_graphrag/_prompt.py

Lines changed: 42 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -5,59 +5,57 @@
55
PROMPTS: Dict[str, Any] = {}
66

77
## NEW
8-
PROMPTS["entity_relationship_extraction"] = """You are a helpful assistant that helps a human analyst perform information discovery in the following domain.
9-
10-
# DOMAIN
8+
PROMPTS["entity_relationship_extraction"] = """# DOMAIN PROMPT
119
{domain}
1210
1311
# GOAL
14-
Given a document and a list of types, first, identify all present entities of those types and, then, all relationships among the identified entities.
1512
Your goal is to highlight information that is relevant to the domain and the questions that may be asked on it.
13+
Given an input document, identify all relevant entities and all relationships among them.
1614
1715
Examples of possible questions:
1816
{example_queries}
1917
2018
# STEPS
21-
1. Identify all entities of the given types. Make sure to extract all and only the entities that are of one of the given types, ignore the others. Use singular names and split compound concepts when necessary (for example, from the sentence "they are movie and theater directors", you should extract the entities "movie director" and "theater director").
19+
1. Identify all entities of the given types. Make sure to extract all and only the entities that are of one of the given types. Use singular names and split compound concepts when necessary (for example, from the sentence "they are movie and theater directors", you should extract the entities "movie director" and "theater director").
2220
2. Identify all relationships between the entities found in step 1. Clearly resolve pronouns to their specific names to maintain clarity.
2321
3. Double check that each entity identified in step 1 appears in at least one relationship. If not, add the missing relationships.
2422
2523
# EXAMPLE DATA
2624
Example types: [location, organization, person, communication]
27-
Example document: Radio City: Radio City is India's first private FM radio station and was started on 3 July 2001. It plays Hindi, English and regional songs. Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features."
25+
Example document: Radio City: Radio City is India's first private FM radio station and was started on 3 July 2001. It plays Hindi, English and regional songs. Radio City recently forayed into new media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.
2826
2927
Output:
3028
{{
31-
"entities": [
32-
{{"name": "Radio City", "type": "organization", "desc": "Radio City is India's first private FM radio station."}},
33-
{{"name": "India", "type": "location", "desc": "The country of India."}},
34-
{{"name": "FM radio station", "type": "communication", "desc": "A radio station that broadcasts using frequency modulation."}},
35-
{{"name": "English", "type": "communication", "desc": "The English language."}},
36-
{{"name": "Hindi", "type": "communication", "desc": "The Hindi language."}},
37-
{{"name": "New Media", "type": "communication", "desc": "New Media is a term for all forms of media that are digital and/or interactive."}},
38-
{{"name": "PlanetRadiocity.com", "type": "organization", "desc": "PlanetRadiocity.com is an online music portal."}},
39-
{{"name": "music portal", "type": "communication", "desc": "A website that offers music related information."}},
40-
{{"name": "news", "type": "communication", "desc": "The concept of news."}},
41-
{{"name": "video", "type": "communication", "desc": "The concept of a video."}},
42-
{{"name": "song", "type": "communication", "desc": "The concept of a song."}}
43-
],
44-
"relationships": [
45-
{{"source": "Radio City", "target": "India", "desc": "Radio City is located in India."}},
46-
{{"source": "Radio City", "target": "FM radio station", "desc": "Radio City is a private FM radio station started on 3 July 2001."}},
47-
{{"source": "Radio City", "target": "English", "desc": "Radio City broadcasts English songs."}},
48-
{{"source": "Radio City", "target": "Hindi", "desc": "Radio City broadcasts songs in the Hindi language."}},
49-
{{"source": "Radio City", "target": "PlanetRadiocity.com", "desc": "Radio City launched PlanetRadiocity.com in May 2008."}},
50-
{{"source": "PlanetRadiocity.com", "target": "music portal", "desc": "PlanetRadiocity.com is a music portal that offers music related news, videos and more."}},
51-
{{"source": "PlanetRadiocity.com", "target": "video", "desc": "PlanetRadiocity.com offers music related videos."}}
52-
],
53-
"other_relationships": [
54-
{{"source": "Radio City", "target": "New Media", "desc": "Radio City forayed into New Media in May 2008."}},
55-
{{"source": "PlanetRadiocity.com", "target": "news", "desc": "PlanetRadiocity.com offers music related news."}},
56-
{{"source": "PlanetRadiocity.com", "target": "song", "desc": "PlanetRadiocity.com offers songs."}}
57-
]
29+
"entities": [
30+
{{"name": "RADIO CITY", "type": "organization", "desc": "Radio City is India's first private FM radio station"}},
31+
{{"name": "INDIA", "type": "location", "desc": "A country"}},
32+
{{"name": "FM RADIO STATION", "type": "communication", "desc": "A radio station that broadcasts using frequency modulation"}},
33+
{{"name": "ENGLISH", "type": "communication", "desc": "A language"}},
34+
{{"name": "HINDI", "type": "communication", "desc": "A language"}},
35+
{{"name": "NEW MEDIA", "type": "communication", "desc": "New media"}},
36+
{{"name": "PLANETRADIOCITY", "type": "organization", "desc": "PlanetRadiocity.com is an online music portal"}},
37+
{{"name": "MUSIC PORTAL", "type": "communication", "desc": "A website that offers music related information"}},
38+
{{"name": "NEWS", "type": "communication", "desc": "News"}},
39+
{{"name": "VIDEO", "type": "communication", "desc": "Video"}},
40+
{{"name": "SONG", "type": "communication", "desc": "Song"}}
41+
],
42+
"relationships": [
43+
{{"source": "RADIO CITY", "target": "INDIA", "desc": "Radio City is located in India"}},
44+
{{"source": "RADIO CITY", "target": "FM RADIO STATION", "desc": "Radio City is a private FM radio station started on 3 July 2001"}},
45+
{{"source": "RADIO CITY", "target": "ENGLISH", "desc": "Radio City broadcasts English songs"}},
46+
{{"source": "RADIO CITY", "target": "HINDI", "desc": "Radio City broadcasts songs in the Hindi language"}},
47+
{{"source": "RADIO CITY", "target": "PLANETRADIOCITY", "desc": "Radio City launched PlanetRadiocity.com in May 2008"}},
48+
{{"source": "PLANETRADIOCITY", "target": "MUSIC PORTAL", "desc": "PlanetRadiocity.com is a music portal"}},
49+
{{"source": "PLANETRADIOCITY", "target": "NEWS", "desc": "PlanetRadiocity.com offers music related news"}},
50+
{{"source": "PLANETRADIOCITY", "target": "SONG", "desc": "PlanetRadiocity.com offers songs"}}
51+
],
52+
"other_relationships": [
53+
{{"source": "RADIO CITY", "target": "NEW MEDIA", "desc": "Radio City forayed into new media in May 2008."}},
54+
{{"source": "PLANETRADIOCITY", "target": "VIDEO", "desc": "PlanetRadiocity.com offers music related videos"}}
55+
]
5856
}}
5957
60-
# REAL DATA
58+
# INPUT DATA
6159
Types: {entity_types}
6260
Document: {input_text}
6361
@@ -70,32 +68,30 @@
7068

7169
PROMPTS["entity_extraction_query"] = """Given the query below, your task is to extract all entities relevant to perform information retrieval to produce an answer.
7270
73-
-Example 1-
71+
-EXAMPLE 1-
7472
Query: Who directed the film that was shot in or around Leland, North Carolina in 1986?
75-
Ouput: {{"named": ["Leland", "North Carolina", "1986"], "generic": ["film director"]}}
73+
Output: {{"named": ["[PLACE] Leland", "[COUNTRY] North Carolina", "[YEAR] 1986"], "generic": ["film director"]}}
7674
77-
-Example 2-
75+
-EXAMPLE 2-
7876
Query: What relationship does Fred Gehrke have to the 23rd overall pick in the 2010 Major League Baseball Draft?
79-
Ouput: {{"named": ["Fred Gehrke", "2010 Major League Baseball Draft"], "generic": ["23rd baseball draft pick"]}}
77+
Output: {{"named": ["[BASEBALL PLAYER] Fred Gehrke", "[EVENT] 2010 Major League Baseball Draft"], "generic": ["23rd baseball draft pick"]}}
8078
81-
# INPUT
79+
-INPUT-
8280
Query: {query}
8381
Output:
8482
"""
8583

8684

87-
8885
PROMPTS[
8986
"summarize_entity_descriptions"
90-
] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
91-
Given the current description, summarize it in a shorter but comprehensive description. Make sure to include all important information.
92-
If the provided description is contradictory, please resolve the contradictions and provide a single, coherent summary.
93-
Make sure it is written in third person, and include the entity names so we the have full context.
87+
] = """You are a helpful assistant responsible for generating a summary of the data provided below.
88+
Given the current description, summarize it by removing redundant and generic information. Resolve any contradictions and provide a single, coherent summary.
89+
Write in third person and explicitly include the entity names to preserve the full context.
9490
95-
Current description:
91+
Current:
9692
{description}
9793
98-
Updated description:
94+
Updated:
9995
"""
10096

10197

fast_graphrag/_services/_state_manager.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
@dataclass
3636
class DefaultStateManagerService(BaseStateManagerService[TEntity, TRelation, THash, TChunk, TId, TEmbedding]):
3737
blob_storage_cls: Type[BaseBlobStorage[csr_matrix]] = field(default=PickleBlobStorage)
38-
insert_similarity_score_threshold: float = field(default=0.8)
38+
insert_similarity_score_threshold: float = field(default=0.9)
3939
query_similarity_score_threshold: Optional[float] = field(default=0.7)
4040

4141
def __post_init__(self):
@@ -133,7 +133,7 @@ async def _get_graphs(
133133
# when selecting the index order.
134134
progress_bar.set_description("Building... [entity deduplication]")
135135
upserted_indices = np.array([i for i, _ in upserted_nodes]).reshape(-1, 1)
136-
similar_indices, scores = await self.entity_storage.get_knn(embeddings, top_k=5)
136+
similar_indices, scores = await self.entity_storage.get_knn(embeddings, top_k=3)
137137
similar_indices = np.array(similar_indices)
138138
scores = np.array(scores)
139139

@@ -190,7 +190,7 @@ async def get_context(
190190

191191
try:
192192
query_embeddings = await self.embedding_service.encode(
193-
[f"[NAME] {n}" for n in entities["named"]] + [f"[NAME] {n}" for n in entities["generic"]] + [query]
193+
[f"{n}" for n in entities["named"]] + [f"[NONE] {n}" for n in entities["generic"]] + [query]
194194
)
195195
entity_scores: List[csr_matrix] = []
196196
# Similarity-search over entities
@@ -255,7 +255,7 @@ async def get_context(
255255
if chunk is not None:
256256
relevant_chunks.append((chunk, s))
257257

258-
return TContext(entities=relevant_entities, relationships=relevant_relationships, chunks=relevant_chunks)
258+
return TContext(entities=relevant_entities, relations=relevant_relationships, chunks=relevant_chunks)
259259
except Exception as e:
260260
logger.error(f"Error during scoring of chunks and relationships.\n{e}")
261261
raise e

fast_graphrag/_storage/_vdb_hnswlib.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
@dataclass
1818
class HNSWVectorStorageConfig:
19-
ef_construction: int = field(default=256)
19+
ef_construction: int = field(default=128)
2020
M: int = field(default=64)
2121
ef_search: int = field(default=96)
2222
num_threads: int = field(default=-1)

0 commit comments

Comments
 (0)