Skip to content

Commit ab276ca

Browse files
Merge branch 'dev'
2 parents ec44e0e + 2f1831e commit ab276ca

File tree

17 files changed

+405
-466
lines changed

17 files changed

+405
-466
lines changed

DocToolsLLM/DocToolsLLM.py

Lines changed: 79 additions & 69 deletions
Large diffs are not rendered by default.

DocToolsLLM/__init__.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,6 @@
44

55
import sys
66
import fire
7-
from typing import List, Tuple
8-
from rich.markdown import Markdown
9-
from rich.console import Console
107

118
from .DocToolsLLM import DocToolsLLM_class as DocToolsLLM
129

@@ -28,17 +25,13 @@ def fire_wrapper(
2825
# --help but not catched by sys.argv
2926
if "help" in kwargs and kwargs["help"]:
3027
print("Showing help")
31-
md = Markdown(DocToolsLLM.__doc__)
32-
console = Console()
33-
console.print(md, style=None)
28+
DocToolsLLM.md_printer(DocToolsLLM.__doc__)
3429
raise SystemExit()
3530

3631
# no args given
3732
if not any([args, kwargs]):
3833
print("Empty arguments, showing help")
39-
md = Markdown(DocToolsLLM.__doc__)
40-
console = Console()
41-
console.print(md, style=None)
34+
DocToolsLLM.md_printer(DocToolsLLM.__doc__)
4235
raise SystemExit()
4336

4437
# while we're at it, make it so that
@@ -75,9 +68,7 @@ def cli_launcher() -> None:
7568
sys_args = sys.argv
7669
if "--help" in sys_args:
7770
print("Showing help")
78-
md = Markdown(DocToolsLLM.__doc__)
79-
console = Console()
80-
console.print(md, style=None)
71+
DocToolsLLM.md_printer(DocToolsLLM.__doc__)
8172
raise SystemExit()
8273
if "--completion" in sys_args:
8374
return fire.Fire(DocToolsLLM)

DocToolsLLM/docs/USAGE.md

Lines changed: 21 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
* `txt`: `--path` is path to txt
2323
* `url`: `--path` must be a valid http(s) link
2424
* `anki`: must be set: `--anki_profile`. Optional: `--anki_deck`,
25-
`--anki_notetype`, `--anki_mode`. See in loader specific arguments
25+
`--anki_notetype`, `--anki_fields`. See in loader specific arguments
2626
below for details.
2727
* `string`: no other parameters needed, will provide a field where
2828
you must type or paste the string
@@ -45,7 +45,7 @@
4545

4646
---
4747

48-
* `--modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet"`
48+
* `--modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet:beta"`
4949
* Keep in mind that given that the default backend used is litellm
5050
the part of modelname before the slash (/) is the backend name (also called provider).
5151
If the backend is 'testing/' then a fake LLM will be used
@@ -110,7 +110,7 @@
110110
if contains `hyde` but modelname contains `testing` then `hyde` will
111111
be removed.
112112

113-
* `--query_eval_modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet"`
113+
* `--query_eval_modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet:beta"`
114114
* Cheaper and quicker model than modelname. Used for intermediate
115115
steps in the RAG, not used in other tasks.
116116
If the value is not part of the model list of litellm, will use
@@ -179,8 +179,8 @@
179179
can be used for example to send notification on your phone
180180
using ntfy.sh to get summaries.
181181

182-
* `--chat_memory`: bool, default `True`
183-
* if True, will remember the messages across a given chat exchange.
182+
* `--memoryless`: bool, default `False`
183+
* if False, will remember the messages across a given chat exchange.
184184
Disabled if using a testing model.
185185

186186
* `--disable_llm_cache`: bool, default `False`
@@ -220,6 +220,9 @@
220220
* `--import_mode`: bool, default `False`
221221
* if True, will return the answer from query instead of printing it
222222

223+
* `--disable_md_printing`: bool, default `True`
224+
* if True, instead of using rich to display some information, defaults to simpler colored prints.
225+
223226
* `--cli_kwargs`: dict, optional
224227
* Any remaining keyword argument will be parsed as a loader
225228
specific argument ((see below)[#loader-specific-arguments]).
@@ -243,23 +246,9 @@
243246
e.g. `science::physics::freshman_year::lesson1`
244247
* `--anki_notetype`: str
245248
* If it's part of the card's notetype, that notetype will be kept.
246-
Case insensitive.
247-
249+
Case insensitive. Note that suspended cards are always ignored.
248250
* `--anki_fields`: List[str]
249251
* List of fields to keep
250-
* `--anki_mode`: str
251-
* any of `window`, `concatenate`, `singlecard`: (or _ separated
252-
value like `concatenate_window`). By default `singlecard`
253-
is used.
254-
* Modes:
255-
* `singlecard`: 1 document is 1 anki card.
256-
* `window`: 1 documents is 5 anki note, overlapping (so
257-
10 anki notes will result in 5 documents)
258-
* `concatenate`: 1 document is all anki notes concatenated as a
259-
single wall of text then split like any long document.
260-
261-
Whichever you choose, you can later filter out documents by metadata
262-
filtering over the `anki_mode` key.
263252

264253
* `--audio_backend`: str
265254
* either 'whisper' or 'deepgram' to transcribe audio.
@@ -381,13 +370,19 @@
381370
BeautifulSoup. Useful to decode html stored in .js files.
382371
Do tell me if you want more of this.
383372

384-
* `--min_lang_prob`: float, default `0.5`
373+
* `--docheck_min_lang_prob`: float, default `0.5`
385374
* float between 0 and 1 that sets the threshold under which to
386375
consider a document invalid if the estimation of
387376
fasttext's langdetect of any language is below that value.
388377
For example, setting it to 0.9 means that only documents that
389378
fasttext thinks have at least 90% probability of being a
390379
language are valid.
380+
* `--docheck_min_token`: int, default `50`
381+
* if we find less than that many tokens in a document, crash.
382+
* `--docheck_max_token`: int, default `1_000_000`
383+
* if we find more than that many tokens in a document, crash.
384+
* `--docheck_max_lines`: int, default `100_000`
385+
* if we find more than that many lines in a document, crash.
391386

392387
* `--source_tag`: str, default `None`
393388
* a string that will be added to the document metadata at the
@@ -401,8 +396,8 @@
401396
# Runtime flags
402397

403398
* `DOCTOOLS_TYPECHECKING`
404-
* Setting for runtime type checking. Default value is `disabled`.
405-
* Possible values:
406-
* `disabled`: disable typechecking
407-
* `warn`: print a red warning if a typechecking fails
408-
* `crash`: crash if a typechecking fails in any function
399+
* Setting for runtime type checking. Default value is `warn`.
* Possible values:
400+
The typing is checked using [beartype](https://beartype.readthedocs.io/en/latest/) so shouldn't slow down the runtime.
401+
* `disabled`: disable typechecking.
402+
* `warn`: print a red warning if a typechecking fails.
403+
* `crash`: crash if a typechecking fails in any function.

DocToolsLLM/utils/batch_file_loader.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414
from tqdm import tqdm
1515
from functools import cache as memoizer
1616
import time
17-
from typing import List, Tuple
18-
from functools import wraps
17+
from typing import List, Tuple, Union
1918
import random
2019

2120
from langchain.docstore.document import Document
@@ -164,7 +163,10 @@ def batch_load_doc(
164163
n_jobs = cli_kwargs["file_loader_n_jobs"]
165164
del cli_kwargs["file_loader_n_jobs"]
166165
else:
167-
n_jobs = 10
166+
if is_debug:
167+
n_jobs = 1
168+
else:
169+
n_jobs = 10
168170

169171
# look for unexpected keys that are not relevant to doc loading, because that would
170172
# skip the cache
@@ -242,8 +244,8 @@ def batch_load_doc(
242244
to_load[idoc]["load_functions"] = parse_load_functions(tuple(doc["load_functions"]))
243245

244246
# wrap doc_loader to cach errors cleanly
245-
@wraps(load_one_doc)
246-
def load_one_doc_wrapped(**doc_kwargs):
247+
@optional_typecheck
248+
def load_one_doc_wrapped(**doc_kwargs) -> Union[List[Document], str]:
247249
try:
248250
out = load_one_doc(**doc_kwargs)
249251
return out
@@ -258,13 +260,10 @@ def load_one_doc_wrapped(**doc_kwargs):
258260
if loading_failure == "crash" or is_debug:
259261
raise
260262
elif loading_failure == "warn":
261-
return err
263+
return str(err)
262264
else:
263265
raise ValueError(loading_failure)
264266

265-
if len(to_load) == 1 or is_debug:
266-
n_jobs = 1
267-
268267
if len(to_load) > 1:
269268
for tl in to_load:
270269
assert tl["filetype"] != "string", "You shouldn't not be using filetype 'string' with other kind of documents normally. Please open an issue on github and explain me your usecase to see how I can fix that for you!"
@@ -283,12 +282,13 @@ def load_one_doc_wrapped(**doc_kwargs):
283282

284283
docs = []
285284
t_load = time.time()
285+
if len(to_load) == 1:
286+
n_jobs = 1
286287
doc_lists = Parallel(
287288
n_jobs=n_jobs,
288289
backend=backend,
289290
)(delayed(load_one_doc_wrapped)(
290291
task=task,
291-
debug=is_debug,
292292
temp_dir=temp_dir,
293293
**d,
294294
) for d in tqdm(

DocToolsLLM/utils/embeddings.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Loads and store embeddings for each document.
44
"""
55

6-
from typing import List, Union, Optional, Any
6+
from typing import List, Union, Optional, Any, Tuple
77
import hashlib
88
import os
99
import queue
@@ -13,7 +13,6 @@
1313
from pathlib import Path, PosixPath
1414
from tqdm import tqdm
1515
import threading
16-
import lazy_import
1716

1817
import numpy as np
1918
from pydantic import Extra
@@ -31,6 +30,7 @@
3130
from .typechecker import optional_typecheck
3231
from .flags import is_verbose
3332

33+
import lazy_import
3434
litellm = lazy_import.lazy_module("litellm")
3535

3636

@@ -44,6 +44,7 @@
4444
class InstructLlamaCPPEmbeddings(LlamaCppEmbeddings, extra=Extra.allow):
4545
"""wrapper around the class LlamaCppEmbeddings to add an instruction
4646
before the text to embed."""
47+
@optional_typecheck
4748
def __init__(self, *args, **kwargs):
4849
embed_instruction=DEFAULT_EMBED_INSTRUCTION
4950
query_instruction=DEFAULT_QUERY_INSTRUCTION
@@ -58,11 +59,13 @@ def __init__(self, *args, **kwargs):
5859
self.embed_instruction = embed_instruction
5960
self.query_instruction = query_instruction
6061

62+
@optional_typecheck
6163
def embed_documents(self, texts: List[str]) -> List[List[float]]:
6264
texts = [self.embed_instruction + t for t in texts]
6365
embeddings = [self.client.embed(text) for text in texts]
6466
return [list(map(float, e)) for e in embeddings]
6567

68+
@optional_typecheck
6669
def embed_query(self, text: str) -> List[float]:
6770
text = self.query_instruction + text
6871
embedding = self.client.embed(text)
@@ -80,7 +83,7 @@ def load_embeddings(
8083
private: bool,
8184
use_rolling: bool,
8285
cli_kwargs: dict,
83-
):
86+
) -> Tuple[FAISS, CacheBackedEmbeddings]:
8487
"""loads embeddings for each document"""
8588
backend = embed_model.split("/", 1)[0]
8689
embed_model = embed_model.replace(backend + "/", "")
@@ -445,6 +448,7 @@ def faiss_saver(
445448

446449

447450
class RollingWindowEmbeddings(SentenceTransformerEmbeddings, extra=Extra.allow):
451+
@optional_typecheck
448452
def __init__(self, *args, **kwargs):
449453
assert "encode_kwargs" in kwargs
450454
if "normalize_embeddings" in kwargs["encode_kwargs"]:
@@ -457,6 +461,7 @@ def __init__(self, *args, **kwargs):
457461
super().__init__(*args, **kwargs)
458462
self.__pool_technique = pooltech
459463

464+
@optional_typecheck
460465
def embed_documents(self, texts, *args, **kwargs):
461466
"""sbert silently crops any token above the max_seq_length,
462467
so we do a windowing embedding then pool (maxpool or meanpool)

DocToolsLLM/utils/flags.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,15 @@
88
# parse args again to know globally if we're in verbose mode
99
kwargs = fire.Fire(lambda *args, **kwargs: kwargs)
1010
is_linux = platform.system() == "Linux"
11+
1112
if "debug" in kwargs and kwargs["debug"]:
1213
is_debug = True
1314
is_verbose = True
1415
else:
1516
is_debug = False
1617
is_verbose = False
18+
19+
if "disable_md_printing" in kwargs and kwargs["disable_md_printing"]:
20+
disable_md_printing = True
21+
else:
22+
disable_md_printing = False

DocToolsLLM/utils/interact.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
from typing import Optional, Tuple, Any
66
import time
7-
import re
87
from pathlib import Path
98
import json
109
from textwrap import dedent
@@ -32,6 +31,7 @@ def get_toolbar_text(settings: dict) -> Any:
3231

3332

3433
class SettingsCompleter(Completer):
34+
@optional_typecheck
3535
def __init__(
3636
self,
3737
doctoolsCliSettings,
@@ -44,6 +44,7 @@ def __init__(
4444
self.doctoolsHistoryPrompts = doctoolsHistoryPrompts
4545
self.doctoolsHistoryWords = doctoolsHistoryWords
4646

47+
@optional_typecheck
4748
def get_completions(self, document, complete_event):
4849
text = document.text_before_cursor
4950
if not text.strip():

0 commit comments

Comments
 (0)