Skip to content

Commit

Permalink
Update settings.yaml
Browse files Browse the repository at this point in the history
  • Loading branch information
severian42 authored Jul 24, 2024
1 parent db34802 commit 1a1f58e
Showing 1 changed file with 135 additions and 68 deletions.
203 changes: 135 additions & 68 deletions ragtest/settings.yaml
Original file line number Diff line number Diff line change
@@ -1,78 +1,145 @@
async_mode: threaded
cache:
base_dir: cache
type: file
chunks:
group_by_columns:
- id
overlap: 64
size: 512
claim_extraction:
description: Any claims or facts that could be relevant to information discovery.
max_gleanings: 0
prompt: prompts/claim_extraction.txt
cluster_graph:
max_cluster_size: 10
community_report:
max_input_length: 8000
max_length: 2000
prompt: prompts/community_report.txt
embed_graph:
enabled: false

encoding_model: cl100k_base
skip_workflows: []
llm:
api_key: ${GRAPHRAG_API_KEY}
type: openai_chat # or azure_openai_chat
model: mistral-nemo:12b-instruct-2407-fp16
model_supports_json: true # recommended if this is available for your model.
max_tokens: 8192
# request_timeout: 180.0
api_base: http://localhost:11434/v1
# api_version: 2024-02-15-preview
# organization: <organization_id>
# deployment_name: <azure_model_deployment_name>
# tokens_per_minute: 150_000 # set a leaky bucket throttle
# requests_per_minute: 10_000 # set a leaky bucket throttle
max_retries: 3
# max_retry_wait: 10.0
# sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
concurrent_requests: 25 # the number of parallel inflight requests that may be made

parallelization:
stagger: 0.3
num_threads: 50 # the number of threads to use for parallel processing

async_mode: threaded # or asyncio

embeddings:
async_mode: threaded
## parallelization: override the global parallelization settings for embeddings
async_mode: threaded # or asyncio
llm:
api_base: http://localhost:11434/api
api_key: ${GRAPHRAG_API_KEY}
concurrent_requests: 25
type: openai_embedding # or azure_openai_embedding
model: nomic-embed-text:latest
model_supports_json: true
provider: openai_embedding
type: openai_embedding
encoding_model: cl100k_base
entity_extraction:
entity_types:
- organization
- person
- geo
- event
max_gleanings: 0
prompt: prompts/entity_extraction.txt
global_search:
concurrency: 32
api_base: http://localhost:11434/api
# api_version: 2024-02-15-preview
# organization: <organization_id>
# deployment_name: <azure_model_deployment_name>
# tokens_per_minute: 150_000 # set a leaky bucket throttle
# requests_per_minute: 10_000 # set a leaky bucket throttle
max_retries: 3
# max_retry_wait: 10.0
# sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
concurrent_requests: 25 # the number of parallel inflight requests that may be made
# batch_size: 1 # the number of documents to send in a single request
# batch_max_tokens: 4000 # the maximum number of tokens to send in a single request
# target: required # or optional



chunks:
size: 512
overlap: 64
group_by_columns: [id] # by default, we don't allow chunks to cross documents

input:
base_dir: input
type: file # or blob
file_type: text # or csv
base_dir: "input"
file_encoding: utf-8
file_pattern: ".*\\.txt$"
file_type: text
type: file
llm:
api_base: http://localhost:11434/v1
api_key: ${GRAPHRAG_API_KEY}
concurrent_requests: 25
max_tokens: 1024
model: qwen2:7b
model_supports_json: true
provider: openai_chat
temperature: 0.5
type: openai_chat # NOTE(review): bare "openai" is not a recognized GraphRAG llm type; chat models use openai_chat (or azure_openai_chat) — confirm against the GraphRAG version in use
local_search: null
parallelization:
num_threads: 50
stagger: 0.3
reporting:
base_dir: output/${timestamp}/reports
type: file
skip_workflows: []
snapshots:
graphml: true
raw_entities: true
top_level_nodes: true
file_pattern: ".*\\.txt$"

cache:
type: file # or blob
base_dir: "cache"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>

storage:
base_dir: output/${timestamp}/artifacts
type: file
type: file # or blob
base_dir: "output/${timestamp}/artifacts"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>

reporting:
type: file # or console, blob
base_dir: "output/${timestamp}/reports"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>

entity_extraction:
## llm: override the global llm settings for this task
## parallelization: override the global parallelization settings for this task
## async_mode: override the global async_mode settings for this task
prompt: "prompts/entity_extraction.txt"
entity_types: [organization, person, geo, event]
max_gleanings: 0

summarize_descriptions:
## llm: override the global llm settings for this task
## parallelization: override the global parallelization settings for this task
## async_mode: override the global async_mode settings for this task
prompt: "prompts/summarize_descriptions.txt"
max_length: 500
prompt: prompts/summarize_descriptions.txt

claim_extraction:
## llm: override the global llm settings for this task
## parallelization: override the global parallelization settings for this task
## async_mode: override the global async_mode settings for this task
# enabled: true
prompt: "prompts/claim_extraction.txt"
description: "Any claims or facts that could be relevant to information discovery."
max_gleanings: 0

community_reports:
## llm: override the global llm settings for this task
## parallelization: override the global parallelization settings for this task
## async_mode: override the global async_mode settings for this task
prompt: "prompts/community_report.txt"
max_length: 2000
max_input_length: 4000

cluster_graph:
max_cluster_size: 10

embed_graph:
enabled: false # if true, will generate node2vec embeddings for nodes
# num_walks: 10
# walk_length: 40
# window_size: 2
# iterations: 3
# random_seed: 597832

umap:
enabled: false
enabled: false # if true, will generate UMAP embeddings for nodes

snapshots:
graphml: false
raw_entities: false
top_level_nodes: false

local_search:
# text_unit_prop: 0.5
# community_prop: 0.1
# conversation_history_max_turns: 5
# top_k_mapped_entities: 10
# top_k_relationships: 10
# max_tokens: 12000

global_search:
# max_tokens: 12000
# data_max_tokens: 12000
# map_max_tokens: 1000
# reduce_max_tokens: 2000
# concurrency: 32

0 comments on commit 1a1f58e

Please sign in to comment.