Enable MacOS and Windows build #8

Closed · wants to merge 8 commits

Changes from all commits
53 changes: 53 additions & 0 deletions .github/workflows/release.yml
@@ -45,6 +45,59 @@ jobs:
name: wheels-linux-${{ matrix.platform.target }}
path: dist

windows:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: windows-latest
target: x64
- runner: windows-latest
target: x86
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
architecture: ${{ matrix.platform.target }}
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-windows-${{ matrix.platform.target }}
path: dist

macos:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: macos-12
target: x86_64
- runner: macos-14
target: aarch64
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-macos-${{ matrix.platform.target }}
path: dist

sdist:
runs-on: ubuntu-22.04
steps:
4 changes: 2 additions & 2 deletions .gitignore
@@ -1,5 +1,5 @@
# Ignore binary model files
# but keep heliport.data dir so maturin picks it up when doing a clean build
# Training files
*.train

# Wheels
wheels*
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## v0.8.0
### Added
- Model creation command.

### Changed
- Include binarized model in the wheel.
10 changes: 7 additions & 3 deletions Cargo.toml
@@ -14,6 +14,9 @@ crate-type = ["lib", "cdylib"]
[workspace]
members = ["heliport-model"]

[profile.release]
lto = "thin"

[build-dependencies]
heliport-model = { path = "heliport-model" }
anyhow = "1.0"
@@ -29,7 +32,7 @@ ordered-float = "4.2"
log = { version = "0.4" }
env_logger = "0.10"
strum = { version = "0.25", features = ["derive"] }
pyo3 = { version = "0.22", features = ["gil-refs", "anyhow"], optional = true }
pyo3 = { version = "0.23", features = ["anyhow"], optional = true }
target = { version = "2.1.0", optional = true }
tempfile = { version = "3", optional = true }
reqwest = { version = "0.12", features = ["stream", "rustls-tls"], optional = true }
@@ -40,13 +43,14 @@ anyhow = "1.0"
rayon = "1.10"
itertools = "0.11"
lazy_static = "1.5"
counter = "0.6.0"

[dev-dependencies]
test-log = "0.2.15"

[features]
# Put log features in default, to allow crates using heli as a library, disable them
default = ["cli", "log/max_level_debug", "log/release_max_level_debug"]
cli = ["python", "dep:clap", "dep:target"]
download = ["dep:tokio", "dep:tempfile", "dep:reqwest", "dep:futures-util"]
cli = ["python", "dep:clap"]
download = ["dep:tokio", "dep:tempfile", "dep:reqwest", "dep:futures-util", "dep:target"]
python = ["dep:pyo3"]
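With this split, `target` and the networking dependencies are only pulled in by the opt-in `download` feature. For illustration only (not part of this diff), enabling it on a from-source build might look like:

```
# Build the CLI with the optional model-download support enabled;
# the default features (cli, log level caps) stay active.
cargo build --release --features download
```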
33 changes: 23 additions & 10 deletions README.md
@@ -14,6 +14,8 @@ Install it in your environment
pip install heliport
```

NOTE: Since version 0.8 models do not need to be downloaded anymore.

### From source
Install the requirements:
- Python
@@ -50,16 +52,27 @@ Arguments:
[OUTPUT_FILE] Output file, default: stdout

Options:
-j, --threads <THREADS> Number of parallel threads to use.
0 means no multi-threading
1 means running the identification in a separated thread
>1 run multithreading [default: 0]
-b, --batch-size <BATCH_SIZE> Number of text segments to pre-load for parallel processing [default: 100000]
-c, --ignore-confidence Ignore confidence thresholds. Predictions under the thresholds will not be labeled as 'und'
-s, --print-scores Print confidence score (higher is better) or raw score (higher is better) in case '-c' is provided
-m, --model-dir <MODEL_DIR> Model directory containing binarized model or plain text model. Default is Python module path or './LanguageModels' if relevant languages are requested
-l, --relevant-langs <RELEVANT_LANGS> Load only relevant languages. Specify a comma-separated list of language codes. Needs plain text model directory
-h, --help Print help
-j, --threads <THREADS>
Number of parallel threads to use.
0 means no multi-threading
1 means running the identification in a separated thread
>1 run multithreading [default: 0]
-b, --batch-size <BATCH_SIZE>
Number of text segments to pre-load for parallel processing [default: 100000]
-c, --ignore-confidence
Ignore confidence thresholds. Predictions under the thresholds will not be
labeled as 'und'
-s, --print-scores
Print confidence score (higher is better) or raw score (higher is better) in case
'-c' is provided
-m, --model-dir <MODEL_DIR>
Model directory containing binarized model or plain text model. Default is Python
module path or './LanguageModels' if relevant languages are requested
-l, --relevant-langs <RELEVANT_LANGS>
Load only relevant languages. Specify a comma-separated list of language codes.
Needs plain text model directory
-h, --help
Print help
```
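As a quick illustration of the reflowed help text above, a typical invocation might look like this (binary name and the INPUT_FILE/OUTPUT_FILE positionals are assumed from the help output, not taken from this diff):

```
# Identify the language of each line in input.txt using 4 worker threads
# and print confidence scores; output goes to stdout since no OUTPUT_FILE is given.
heliport identify -j 4 -s input.txt
```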

### Python package
1 change: 1 addition & 0 deletions heliport-model/Cargo.toml
@@ -10,3 +10,4 @@ strum = { version = "0.25", features = ["derive"] }
strum_macros = "0.25"
wyhash2 = "0.2.1"
anyhow = "1.0"
rayon = "1.10"
32 changes: 22 additions & 10 deletions heliport-model/src/languagemodel.rs
@@ -10,6 +10,7 @@ use std::thread;
use anyhow::{Context, Result, bail};
use bitcode;
use log::{info, debug, warn};
use rayon::prelude::*;
use strum::{Display, EnumCount, IntoEnumIterator};
use strum_macros::EnumIter;

@@ -87,6 +88,7 @@ impl ModelNgram {
dic: HashMap::default(),
model_type: model_type.clone(),
};
let model_repr = model_type.to_string();

// Open languagelist for this model
let lang_list = fs::read_to_string(model_dir.join("languagelist"))
@@ -99,7 +101,7 @@ impl ModelNgram {
let lang_repr = lang.to_string().to_lowercase();
// Models may not have all the language codes supported by the library
if !lang_list.contains(&lang_repr[..]) {
warn!("Language '{lang_repr}' not found in languagelist, omitting");
warn!("{model_repr}: Language '{lang_repr}' not found in languagelist, omitting");
continue;
}

@@ -291,16 +293,26 @@ impl Index<usize> for Model {

/// Binarize models and save in a path
pub fn binarize(save_path: &Path, model_path: &Path) -> Result<()> {
for model_type in OrderNgram::iter() {
let type_repr = model_type.to_string();
info!("Loading {type_repr} model");
let model = ModelNgram::from_text(&model_path, model_type, None)?;
let size = model.dic.len();
info!("Created {size} entries");
let filename = save_path.join(format!("{type_repr}.bin"));
info!("Saving {type_repr} model");
model.save(Path::new(&filename))?;
let orders: Vec<_ > = OrderNgram::iter().collect();

let results: Vec<Result<_>> = orders
.par_iter()
.panic_fuse()
.map(|model_type| -> Result<()> {
let type_repr = model_type.to_string();
info!("{type_repr}: loading text model");
let model = ModelNgram::from_text(&model_path, model_type.clone(), None)?;
let size = model.dic.len();
let filename = save_path.join(format!("{type_repr}.bin"));
info!("{type_repr}: saving binarized model with {size} entries");
model.save(Path::new(&filename))
}).collect();

// If there is one error, propagate
for r in results {
let _ = r?;
}

info!("Copying confidence thresholds file");
fs::copy(
model_path.join(Model::CONFIDENCE_FILE),
51 changes: 51 additions & 0 deletions src/cli/create_models.rs
@@ -0,0 +1,51 @@
use std::path::{PathBuf};
use std::process::exit;
use std::time::Instant;

use anyhow::Context;
use clap::Args;
use log::{info, error};
use pyo3::prelude::*;
use rayon::prelude::*;

use crate::utils::Abort;
use crate::trainer::count_all_ngrams;

#[derive(Args, Clone)]
pub struct CreateModelCmd {
#[arg(help="Output directory to save the ngram frequency files")]
output_dir: PathBuf,
#[arg(help="Directory where input text files are located")]
input_files: Vec<PathBuf>,
#[arg(short = 'k', long, default_value_t = 10000, help="Truncate at top-k most frequent n-grams")]
topk: usize,
}

impl CreateModelCmd {
pub fn cli(self) -> PyResult<()> {
info!("Starting");
let now = Instant::now();

if !self.output_dir.exists() {
error!("Output directory '{}' does not exist, please create it", self.output_dir.display());
exit(1);
}

info!("Saving top {} most frequent n-grams", self.topk);

// Train each file/language in parallel
// use panic_fuse to fail early if one of the jobs fail
self.input_files
.into_par_iter()
.panic_fuse()
.for_each(|lang_file| {
count_all_ngrams(&lang_file, &self.output_dir, self.topk)
.with_context(|| format!("Error with file '{}'", lang_file.display()))
.or_abort(1);
});

info!("Finished");
info!("Elapsed time: {:.2?}", now.elapsed());
Ok(())
}
}
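Assuming clap's default kebab-case naming for the new subcommand, a hypothetical invocation of the model-creation command could look like the following (corpus paths are made up for illustration; the output directory must already exist, since the command checks for it and exits otherwise):

```
# Count n-grams for each input corpus in parallel, keep the 10,000 most
# frequent per order, and write the frequency files into models/.
mkdir -p models
heliport create-model models/ corpus/eng.txt corpus/spa.txt -k 10000
```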
14 changes: 13 additions & 1 deletion src/cli/identify.rs
@@ -2,11 +2,12 @@ use std::io::{self, BufRead, BufReader, Write, BufWriter};
use std::fs::File;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::time::Instant;

use anyhow::{Context, Result};
use clap::Args;
use itertools::Itertools;
use log::{debug};
use log::{info, debug};
use pyo3::prelude::*;

use heliport_model::Lang;
@@ -71,10 +72,14 @@ fn parse_langs(langs_text: &Vec<String>) -> Result<Vec<Lang>> {

impl IdentifyCmd {
pub fn cli(self) -> PyResult<()> {
info!("Starting");
let now = Instant::now();

// If provided, parse the list of relevant languages
let mut relevant_langs = None;
if let Some(r) = &self.relevant_langs {
relevant_langs = Some(parse_langs(&r).or_abort(1));
info!("Using relevant langs: {:?}", relevant_langs.as_ref().unwrap());
}
debug!("{:?}", self);

@@ -106,19 +111,26 @@ impl IdentifyCmd {
output_file = Box::new(io::stdout().lock());
}

info!("Loading model");
// Load identifier
let mut identifier = Identifier::load(&model_dir, relevant_langs)
.or_abort(1);
if self.ignore_confidence {
info!("Disabled confidence thresholds");
identifier.disable_confidence();
}

// do not run on separated threads if multithreading is not requested
if self.threads == 0 {
info!("Running single-threaded");
self.run_single(identifier, input_file, output_file).or_abort(1);
} else {
info!("Running with {} threads", self.threads);
self.run_parallel(identifier, input_file, output_file).or_abort(1);
}

info!("Finished");
info!("Elapsed time: {:.2?}", now.elapsed());
Ok(())
}

13 changes: 12 additions & 1 deletion src/cli/mod.rs
@@ -2,6 +2,7 @@ mod identify;
#[cfg(feature = "download")]
mod download;
mod binarize;
mod create_models;

use clap::{Subcommand, Parser};
use log::{debug};
@@ -13,12 +14,15 @@ use crate::python::module_path;
use self::download::DownloadCmd;
use self::binarize::BinarizeCmd;
use self::identify::IdentifyCmd;
use self::create_models::CreateModelCmd;

#[derive(Parser, Clone)]
#[command(version, about, long_about = None)]
pub struct Cli {
#[command(subcommand)]
command: Commands,
#[arg(short, long, help="Do not print log messages")]
quiet: bool,
}

#[derive(Subcommand, Clone)]
@@ -31,6 +35,8 @@ enum Commands {
Binarize(BinarizeCmd),
#[command(about="Identify languages of input text", visible_alias="detect")]
Identify(IdentifyCmd),
#[command(about="Create heliport models")]
CreateModel(CreateModelCmd),
}


@@ -41,12 +47,17 @@ pub fn cli_run() -> PyResult<()> {
let os_args = std::env::args_os().skip(1);
let args = Cli::parse_from(os_args);
debug!("Module path found at: {}", module_path().expect("Could not found module path").display());
env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
if !args.quiet {
env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
} else {
env_logger::Builder::from_env(Env::default().default_filter_or("error")).init();
}

match args.command {
#[cfg(feature = "download")]
Commands::Download(cmd) => { cmd.cli() },
Commands::Binarize(cmd) => { cmd.cli() },
Commands::Identify(cmd) => { cmd.cli() },
Commands::CreateModel(cmd) => { cmd.cli() },
}
}
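A hedged sketch of the new global flag in use (binary name assumed): with `--quiet` the logger defaults to the error level instead of info, so the progress messages added in this PR are suppressed.

```
# Suppress info-level progress messages; only errors are printed.
heliport --quiet identify input.txt predictions.txt
```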