Skip to content

Commit ab276ca

Browse files
Merge branch 'dev'
2 parents ec44e0e + 2f1831e commit ab276ca

File tree

17 files changed

+405
-466
lines changed

17 files changed

+405
-466
lines changed

DocToolsLLM/DocToolsLLM.py

Lines changed: 79 additions & 69 deletions
Large diffs are not rendered by default.

DocToolsLLM/__init__.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,6 @@
44

55
import sys
66
import fire
7-
from typing import List, Tuple
8-
from rich.markdown import Markdown
9-
from rich.console import Console
107

118
from .DocToolsLLM import DocToolsLLM_class as DocToolsLLM
129

@@ -28,17 +25,13 @@ def fire_wrapper(
2825
# --help but not catched by sys.argv
2926
if "help" in kwargs and kwargs["help"]:
3027
print("Showing help")
31-
md = Markdown(DocToolsLLM.__doc__)
32-
console = Console()
33-
console.print(md, style=None)
28+
DocToolsLLM.md_printer(DocToolsLLM.__doc__)
3429
raise SystemExit()
3530

3631
# no args given
3732
if not any([args, kwargs]):
3833
print("Empty arguments, showing help")
39-
md = Markdown(DocToolsLLM.__doc__)
40-
console = Console()
41-
console.print(md, style=None)
34+
DocToolsLLM.md_printer(DocToolsLLM.__doc__)
4235
raise SystemExit()
4336

4437
# while we're at it, make it so that
@@ -75,9 +68,7 @@ def cli_launcher() -> None:
7568
sys_args = sys.argv
7669
if "--help" in sys_args:
7770
print("Showing help")
78-
md = Markdown(DocToolsLLM.__doc__)
79-
console = Console()
80-
console.print(md, style=None)
71+
DocToolsLLM.md_printer(DocToolsLLM.__doc__)
8172
raise SystemExit()
8273
if "--completion" in sys_args:
8374
return fire.Fire(DocToolsLLM)

DocToolsLLM/docs/USAGE.md

Lines changed: 21 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
* `txt`: `--path` is path to txt
2323
* `url`: `--path` must be a valid http(s) link
2424
* `anki`: must be set: `--anki_profile`. Optional: `--anki_deck`,
25-
`--anki_notetype`, `--anki_mode`. See in loader specific arguments
25+
`--anki_notetype`, `--anki_fields`. See in loader specific arguments
2626
below for details.
2727
* `string`: no other parameters needed, will provide a field where
2828
you must type or paste the string
@@ -45,7 +45,7 @@
4545

4646
---
4747

48-
* `--modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet"`
48+
* `--modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet:beta"`
4949
* Keep in mind that given that the default backend used is litellm
5050
the part of modelname before the slash (/) is the backend name (also called provider).
5151
If the backend is 'testing/' then a fake LLM will be used
@@ -110,7 +110,7 @@
110110
if contains `hyde` but modelname contains `testing` then `hyde` will
111111
be removed.
112112

113-
* `--query_eval_modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet"`
113+
* `--query_eval_modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet:beta"`
114114
* Cheaper and quicker model than modelname. Used for intermediate
115115
steps in the RAG, not used in other tasks.
116116
If the value is not part of the model list of litellm, will use
@@ -179,8 +179,8 @@
179179
can be used for example to send notification on your phone
180180
using ntfy.sh to get summaries.
181181

182-
* `--chat_memory`: bool, default `True`
183-
* if True, will remember the messages across a given chat exchange.
182+
* `--memoryless`: bool, default `False`
183+
* if False, will remember the messages across a given chat exchange.
184184
Disabled if using a testing model.
185185

186186
* `--disable_llm_cache`: bool, default `False`
@@ -220,6 +220,9 @@
220220
* `--import_mode`: bool, default `False`
221221
* if True, will return the answer from query instead of printing it
222222

223+
* `--disable_md_printing`: bool, default `True`
224+
* if True, instead of using rich to display some information, defaults to simpler colored prints.
225+
223226
* `--cli_kwargs`: dict, optional
224227
* Any remaining keyword argument will be parsed as a loader
225228
specific argument ((see below)[#loader-specific-arguments]).
@@ -243,23 +246,9 @@
243246
e.g. `science::physics::freshman_year::lesson1`
244247
* `--anki_notetype`: str
245248
* If it's part of the card's notetype, that notetype will be kept.
246-
Case insensitive.
247-
249+
Case insensitive. Note that suspended cards are always ignored.
248250
* `--anki_fields`: List[str]
249251
* List of fields to keep
250-
* `--anki_mode`: str
251-
* any of `window`, `concatenate`, `singlecard`: (or _ separated
252-
value like `concatenate_window`). By default `singlecard`
253-
is used.
254-
* Modes:
255-
* `singlecard`: 1 document is 1 anki card.
256-
* `window`: 1 documents is 5 anki note, overlapping (so
257-
10 anki notes will result in 5 documents)
258-
* `concatenate`: 1 document is all anki notes concatenated as a
259-
single wall of text then split like any long document.
260-
261-
Whichever you choose, you can later filter out documents by metadata
262-
filtering over the `anki_mode` key.
263252

264253
* `--audio_backend`: str
265254
* either 'whisper' or 'deepgram' to transcribe audio.
@@ -381,13 +370,19 @@
381370
BeautifulSoup. Useful to decode html stored in .js files.
382371
Do tell me if you want more of this.
383372

384-
* `--min_lang_prob`: float, default `0.5`
373+
* `--docheck_min_lang_prob`: float, default `0.5`
385374
* float between 0 and 1 that sets the threshold under which to
386375
consider a document invalid if the estimation of
387376
fasttext's langdetect of any language is below that value.
388377
For example, setting it to 0.9 means that only documents that
389378
fasttext thinks have at least 90% probability of being a
390379
language are valid.
380+
* `--docheck_min_token`: int, default `50`
381+
* if we find less than that many tokens in a document, crash.
382+
* `--docheck_max_token`: int, default `1_000_000`
383+
* if we find more than that many tokens in a document, crash.
384+
* `--docheck_max_lines`: int, default `100_000`
385+
* if we find more than that many lines in a document, crash.
391386

392387
* `--source_tag`: str, default `None`
393388
* a string that will be added to the document metadata at the
@@ -401,8 +396,8 @@
401396
# Runtime flags
402397

403398
* `DOCTOOLS_TYPECHECKING`
404-
* Setting for runtime type checking. Default value is `disabled`.
405-
* Possible values:
406-
* `disabled`: disable typechecking
407-
* `warn`: print a red warning if a typechecking fails
408-
* `crash`: crash if a typechecking fails in any function
399+
* Setting for runtime type checking. Default value is `warn`.
* Possible values:
400+
The typing is checked using [beartype](https://beartype.readthedocs.io/en/latest/) so shouldn't slow down the runtime.
401+
* `disabled`: disable typechecking.
402+
* `warn`: print a red warning if a typechecking fails.
403+
* `crash`: crash if a typechecking fails in any function.

DocToolsLLM/utils/batch_file_loader.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414
from tqdm import tqdm
1515
from functools import cache as memoizer
1616
import time
17-
from typing import List, Tuple
18-
from functools import wraps
17+
from typing import List, Tuple, Union
1918
import random
2019

2120
from langchain.docstore.document import Document
@@ -164,7 +163,10 @@ def batch_load_doc(
164163
n_jobs = cli_kwargs["file_loader_n_jobs"]
165164
del cli_kwargs["file_loader_n_jobs"]
166165
else:
167-
n_jobs = 10
166+
if is_debug:
167+
n_jobs = 1
168+
else:
169+
n_jobs = 10
168170

169171
# look for unexpected keys that are not relevant to doc loading, because that would
170172
# skip the cache
@@ -242,8 +244,8 @@ def batch_load_doc(
242244
to_load[idoc]["load_functions"] = parse_load_functions(tuple(doc["load_functions"]))
243245

244246
# wrap doc_loader to cach errors cleanly
245-
@wraps(load_one_doc)
246-
def load_one_doc_wrapped(**doc_kwargs):
247+
@optional_typecheck
248+
def load_one_doc_wrapped(**doc_kwargs) -> Union[List[Document], str]:
247249
try:
248250
out = load_one_doc(**doc_kwargs)
249251
return out
@@ -258,13 +260,10 @@ def load_one_doc_wrapped(**doc_kwargs):
258260
if loading_failure == "crash" or is_debug:
259261
raise
260262
elif loading_failure == "warn":
261-
return err
263+
return str(err)
262264
else:
263265
raise ValueError(loading_failure)
264266

265-
if len(to_load) == 1 or is_debug:
266-
n_jobs = 1
267-
268267
if len(to_load) > 1:
269268
for tl in to_load:
270269
assert tl["filetype"] != "string", "You shouldn't not be using filetype 'string' with other kind of documents normally. Please open an issue on github and explain me your usecase to see how I can fix that for you!"
@@ -283,12 +282,13 @@ def load_one_doc_wrapped(**doc_kwargs):
283282

284283
docs = []
285284
t_load = time.time()
285+
if len(to_load) == 1:
286+
n_jobs = 1
286287
doc_lists = Parallel(
287288
n_jobs=n_jobs,
288289
backend=backend,
289290
)(delayed(load_one_doc_wrapped)(
290291
task=task,
291-
debug=is_debug,
292292
temp_dir=temp_dir,
293293
**d,
294294
) for d in tqdm(

DocToolsLLM/utils/embeddings.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Loads and store embeddings for each document.
44
"""
55

6-
from typing import List, Union, Optional, Any
6+
from typing import List, Union, Optional, Any, Tuple
77
import hashlib
88
import os
99
import queue
@@ -13,7 +13,6 @@
1313
from pathlib import Path, PosixPath
1414
from tqdm import tqdm
1515
import threading
16-
import lazy_import
1716

1817
import numpy as np
1918
from pydantic import Extra
@@ -31,6 +30,7 @@
3130
from .typechecker import optional_typecheck
3231
from .flags import is_verbose
3332

33+
import lazy_import
3434
litellm = lazy_import.lazy_module("litellm")
3535

3636

@@ -44,6 +44,7 @@
4444
class InstructLlamaCPPEmbeddings(LlamaCppEmbeddings, extra=Extra.allow):
4545
"""wrapper around the class LlamaCppEmbeddings to add an instruction
4646
before the text to embed."""
47+
@optional_typecheck
4748
def __init__(self, *args, **kwargs):
4849
embed_instruction=DEFAULT_EMBED_INSTRUCTION
4950
query_instruction=DEFAULT_QUERY_INSTRUCTION
@@ -58,11 +59,13 @@ def __init__(self, *args, **kwargs):
5859
self.embed_instruction = embed_instruction
5960
self.query_instruction = query_instruction
6061

62+
@optional_typecheck
6163
def embed_documents(self, texts: List[str]) -> List[List[float]]:
6264
texts = [self.embed_instruction + t for t in texts]
6365
embeddings = [self.client.embed(text) for text in texts]
6466
return [list(map(float, e)) for e in embeddings]
6567

68+
@optional_typecheck
6669
def embed_query(self, text: str) -> List[float]:
6770
text = self.query_instruction + text
6871
embedding = self.client.embed(text)
@@ -80,7 +83,7 @@ def load_embeddings(
8083
private: bool,
8184
use_rolling: bool,
8285
cli_kwargs: dict,
83-
):
86+
) -> Tuple[FAISS, CacheBackedEmbeddings]:
8487
"""loads embeddings for each document"""
8588
backend = embed_model.split("/", 1)[0]
8689
embed_model = embed_model.replace(backend + "/", "")
@@ -445,6 +448,7 @@ def faiss_saver(
445448

446449

447450
class RollingWindowEmbeddings(SentenceTransformerEmbeddings, extra=Extra.allow):
451+
@optional_typecheck
448452
def __init__(self, *args, **kwargs):
449453
assert "encode_kwargs" in kwargs
450454
if "normalize_embeddings" in kwargs["encode_kwargs"]:
@@ -457,6 +461,7 @@ def __init__(self, *args, **kwargs):
457461
super().__init__(*args, **kwargs)
458462
self.__pool_technique = pooltech
459463

464+
@optional_typecheck
460465
def embed_documents(self, texts, *args, **kwargs):
461466
"""sbert silently crops any token above the max_seq_length,
462467
so we do a windowing embedding then pool (maxpool or meanpool)

DocToolsLLM/utils/flags.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,15 @@
88
# parse args again to know globally if we're in verbose mode
99
kwargs = fire.Fire(lambda *args, **kwargs: kwargs)
1010
is_linux = platform.system() == "Linux"
11+
1112
if "debug" in kwargs and kwargs["debug"]:
1213
is_debug = True
1314
is_verbose = True
1415
else:
1516
is_debug = False
1617
is_verbose = False
18+
19+
if "disable_md_printing" in kwargs and kwargs["disable_md_printing"]:
20+
disable_md_printing = True
21+
else:
22+
disable_md_printing = False

DocToolsLLM/utils/interact.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
from typing import Optional, Tuple, Any
66
import time
7-
import re
87
from pathlib import Path
98
import json
109
from textwrap import dedent
@@ -32,6 +31,7 @@ def get_toolbar_text(settings: dict) -> Any:
3231

3332

3433
class SettingsCompleter(Completer):
34+
@optional_typecheck
3535
def __init__(
3636
self,
3737
doctoolsCliSettings,
@@ -44,6 +44,7 @@ def __init__(
4444
self.doctoolsHistoryPrompts = doctoolsHistoryPrompts
4545
self.doctoolsHistoryWords = doctoolsHistoryWords
4646

47+
@optional_typecheck
4748
def get_completions(self, document, complete_event):
4849
text = document.text_before_cursor
4950
if not text.strip():

0 commit comments

Comments
 (0)