Enable MacOS and Windows build #8

Closed · wants to merge 8 commits

Changes from all commits
53 changes: 53 additions & 0 deletions .github/workflows/release.yml
@@ -45,6 +45,59 @@ jobs:
name: wheels-linux-${{ matrix.platform.target }}
path: dist

windows:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: windows-latest
target: x64
- runner: windows-latest
target: x86
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
architecture: ${{ matrix.platform.target }}
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-windows-${{ matrix.platform.target }}
path: dist

macos:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: macos-12
target: x86_64
- runner: macos-14
target: aarch64
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --release --out dist --find-interpreter
sccache: 'true'
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-macos-${{ matrix.platform.target }}
path: dist

sdist:
runs-on: ubuntu-22.04
steps:
4 changes: 2 additions & 2 deletions .gitignore
@@ -1,5 +1,5 @@
# Ignore binary model files
# but keep heliport.data dir so maturin picks it up when doing a clean build
# Training files
*.train

# Wheels
wheels*
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## v0.8.0
### Added
- Model creation command.

### Changed
- Include binarized model in the wheel.
10 changes: 7 additions & 3 deletions Cargo.toml
@@ -14,6 +14,9 @@ crate-type = ["lib", "cdylib"]
[workspace]
members = ["heliport-model"]

[profile.release]
lto = "thin"

[build-dependencies]
heliport-model = { path = "heliport-model" }
anyhow = "1.0"
@@ -29,7 +32,7 @@ ordered-float = "4.2"
log = { version = "0.4" }
env_logger = "0.10"
strum = { version = "0.25", features = ["derive"] }
pyo3 = { version = "0.22", features = ["gil-refs", "anyhow"], optional = true }
pyo3 = { version = "0.23", features = ["anyhow"], optional = true }
target = { version = "2.1.0", optional = true }
tempfile = { version = "3", optional = true }
reqwest = { version = "0.12", features = ["stream", "rustls-tls"], optional = true }
@@ -40,13 +43,14 @@ anyhow = "1.0"
rayon = "1.10"
itertools = "0.11"
lazy_static = "1.5"
counter = "0.6.0"

[dev-dependencies]
test-log = "0.2.15"

[features]
# Put log features in default, to allow crates using heli as a library, disable them
default = ["cli", "log/max_level_debug", "log/release_max_level_debug"]
cli = ["python", "dep:clap", "dep:target"]
download = ["dep:tokio", "dep:tempfile", "dep:reqwest", "dep:futures-util"]
cli = ["python", "dep:clap"]
download = ["dep:tokio", "dep:tempfile", "dep:reqwest", "dep:futures-util", "dep:target"]
python = ["dep:pyo3"]
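With this split, `target` and the networking dependencies are only pulled in by the opt-in `download` feature. For illustration only (not part of this diff), enabling it on a from-source build might look like:

```
# Build the CLI with the optional model-download support enabled;
# the default features (cli, log level caps) stay active.
cargo build --release --features download
```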
33 changes: 23 additions & 10 deletions README.md
@@ -14,6 +14,8 @@ Install it in your environment
pip install heliport
```

NOTE: Since version 0.8 models do not need to be downloaded anymore.

### From source
Install the requirements:
- Python
@@ -50,16 +52,27 @@ Arguments:
[OUTPUT_FILE] Output file, default: stdout

Options:
-j, --threads <THREADS> Number of parallel threads to use.
0 means no multi-threading
1 means running the identification in a separated thread
>1 run multithreading [default: 0]
-b, --batch-size <BATCH_SIZE> Number of text segments to pre-load for parallel processing [default: 100000]
-c, --ignore-confidence Ignore confidence thresholds. Predictions under the thresholds will not be labeled as 'und'
-s, --print-scores Print confidence score (higher is better) or raw score (higher is better) in case '-c' is provided
-m, --model-dir <MODEL_DIR> Model directory containing binarized model or plain text model. Default is Python module path or './LanguageModels' if relevant languages are requested
-l, --relevant-langs <RELEVANT_LANGS> Load only relevant languages. Specify a comma-separated list of language codes. Needs plain text model directory
-h, --help Print help
-j, --threads <THREADS>
Number of parallel threads to use.
0 means no multi-threading
1 means running the identification in a separated thread
>1 run multithreading [default: 0]
-b, --batch-size <BATCH_SIZE>
Number of text segments to pre-load for parallel processing [default: 100000]
-c, --ignore-confidence
Ignore confidence thresholds. Predictions under the thresholds will not be
labeled as 'und'
-s, --print-scores
Print confidence score (higher is better) or raw score (higher is better) in case
'-c' is provided
-m, --model-dir <MODEL_DIR>
Model directory containing binarized model or plain text model. Default is Python
module path or './LanguageModels' if relevant languages are requested
-l, --relevant-langs <RELEVANT_LANGS>
Load only relevant languages. Specify a comma-separated list of language codes.
Needs plain text model directory
-h, --help
Print help
```
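As a quick illustration of the reflowed help text above, a typical invocation might look like this (binary name and the INPUT_FILE/OUTPUT_FILE positionals are assumed from the help output, not taken from this diff):

```
# Identify the language of each line in input.txt using 4 worker threads
# and print confidence scores; output goes to stdout since no OUTPUT_FILE is given.
heliport identify -j 4 -s input.txt
```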

### Python package
1 change: 1 addition & 0 deletions heliport-model/Cargo.toml
@@ -10,3 +10,4 @@ strum = { version = "0.25", features = ["derive"] }
strum_macros = "0.25"
wyhash2 = "0.2.1"
anyhow = "1.0"
rayon = "1.10"
32 changes: 22 additions & 10 deletions heliport-model/src/languagemodel.rs
@@ -10,6 +10,7 @@ use std::thread;
use anyhow::{Context, Result, bail};
use bitcode;
use log::{info, debug, warn};
use rayon::prelude::*;
use strum::{Display, EnumCount, IntoEnumIterator};
use strum_macros::EnumIter;

@@ -87,6 +88,7 @@ impl ModelNgram {
dic: HashMap::default(),
model_type: model_type.clone(),
};
let model_repr = model_type.to_string();

// Open languagelist for this model
let lang_list = fs::read_to_string(model_dir.join("languagelist"))
@@ -99,7 +101,7 @@ impl ModelNgram {
let lang_repr = lang.to_string().to_lowercase();
// Models may not have all the language codes supported by the library
if !lang_list.contains(&lang_repr[..]) {
warn!("Language '{lang_repr}' not found in languagelist, omitting");
warn!("{model_repr}: Language '{lang_repr}' not found in languagelist, omitting");
continue;
}

@@ -291,16 +293,26 @@ impl Index<usize> for Model {

/// Binarize models and save in a path
pub fn binarize(save_path: &Path, model_path: &Path) -> Result<()> {
for model_type in OrderNgram::iter() {
let type_repr = model_type.to_string();
info!("Loading {type_repr} model");
let model = ModelNgram::from_text(&model_path, model_type, None)?;
let size = model.dic.len();
info!("Created {size} entries");
let filename = save_path.join(format!("{type_repr}.bin"));
info!("Saving {type_repr} model");
model.save(Path::new(&filename))?;
let orders: Vec<_ > = OrderNgram::iter().collect();

let results: Vec<Result<_>> = orders
.par_iter()
.panic_fuse()
.map(|model_type| -> Result<()> {
let type_repr = model_type.to_string();
info!("{type_repr}: loading text model");
let model = ModelNgram::from_text(&model_path, model_type.clone(), None)?;
let size = model.dic.len();
let filename = save_path.join(format!("{type_repr}.bin"));
info!("{type_repr}: saving binarized model with {size} entries");
model.save(Path::new(&filename))
}).collect();

// If there is one error, propagate
for r in results {
let _ = r?;
}

info!("Copying confidence thresholds file");
fs::copy(
model_path.join(Model::CONFIDENCE_FILE),
51 changes: 51 additions & 0 deletions src/cli/create_models.rs
@@ -0,0 +1,51 @@
use std::path::{PathBuf};
use std::process::exit;
use std::time::Instant;

use anyhow::Context;
use clap::Args;
use log::{info, error};
use pyo3::prelude::*;
use rayon::prelude::*;

use crate::utils::Abort;
use crate::trainer::count_all_ngrams;

#[derive(Args, Clone)]
pub struct CreateModelCmd {
#[arg(help="Output directory to save the ngram frequency files")]
output_dir: PathBuf,
#[arg(help="Directory where input text files are located")]
input_files: Vec<PathBuf>,
#[arg(short = 'k', long, default_value_t = 10000, help="Truncate at top-k most frequent n-grams")]
topk: usize,
}

impl CreateModelCmd {
pub fn cli(self) -> PyResult<()> {
info!("Starting");
let now = Instant::now();

if !self.output_dir.exists() {
error!("Output directory '{}' does not exist, please create it", self.output_dir.display());
exit(1);
}

info!("Saving top {} most frequent n-grams", self.topk);

// Train each file/language in parallel
// use panic_fuse to fail early if one of the jobs fail
self.input_files
.into_par_iter()
.panic_fuse()
.for_each(|lang_file| {
count_all_ngrams(&lang_file, &self.output_dir, self.topk)
.with_context(|| format!("Error with file '{}'", lang_file.display()))
.or_abort(1);
});

info!("Finished");
info!("Elapsed time: {:.2?}", now.elapsed());
Ok(())
}
}
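Assuming clap's default kebab-case naming for the new subcommand, a hypothetical invocation of the model-creation command could look like the following (corpus paths are made up for illustration; the output directory must already exist, since the command checks for it and exits otherwise):

```
# Count n-grams for each input corpus in parallel, keep the 10,000 most
# frequent per order, and write the frequency files into models/.
mkdir -p models
heliport create-model models/ corpus/eng.txt corpus/spa.txt -k 10000
```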
14 changes: 13 additions & 1 deletion src/cli/identify.rs
@@ -2,11 +2,12 @@ use std::io::{self, BufRead, BufReader, Write, BufWriter};
use std::fs::File;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::time::Instant;

use anyhow::{Context, Result};
use clap::Args;
use itertools::Itertools;
use log::{debug};
use log::{info, debug};
use pyo3::prelude::*;

use heliport_model::Lang;
@@ -71,10 +72,14 @@ fn parse_langs(langs_text: &Vec<String>) -> Result<Vec<Lang>> {

impl IdentifyCmd {
pub fn cli(self) -> PyResult<()> {
info!("Starting");
let now = Instant::now();

// If provided, parse the list of relevant languages
let mut relevant_langs = None;
if let Some(r) = &self.relevant_langs {
relevant_langs = Some(parse_langs(&r).or_abort(1));
info!("Using relevant langs: {:?}", relevant_langs.as_ref().unwrap());
}
debug!("{:?}", self);

@@ -106,19 +111,26 @@ impl IdentifyCmd {
output_file = Box::new(io::stdout().lock());
}

info!("Loading model");
// Load identifier
let mut identifier = Identifier::load(&model_dir, relevant_langs)
.or_abort(1);
if self.ignore_confidence {
info!("Disabled confidence thresholds");
identifier.disable_confidence();
}

// do not run on separated threads if multithreading is not requested
if self.threads == 0 {
info!("Running single-threaded");
self.run_single(identifier, input_file, output_file).or_abort(1);
} else {
info!("Running with {} threads", self.threads);
self.run_parallel(identifier, input_file, output_file).or_abort(1);
}

info!("Finished");
info!("Elapsed time: {:.2?}", now.elapsed());
Ok(())
}

13 changes: 12 additions & 1 deletion src/cli/mod.rs
@@ -2,6 +2,7 @@ mod identify;
#[cfg(feature = "download")]
mod download;
mod binarize;
mod create_models;

use clap::{Subcommand, Parser};
use log::{debug};
@@ -13,12 +14,15 @@ use crate::python::module_path;
use self::download::DownloadCmd;
use self::binarize::BinarizeCmd;
use self::identify::IdentifyCmd;
use self::create_models::CreateModelCmd;

#[derive(Parser, Clone)]
#[command(version, about, long_about = None)]
pub struct Cli {
#[command(subcommand)]
command: Commands,
#[arg(short, long, help="Do not print log messages")]
quiet: bool,
}

#[derive(Subcommand, Clone)]
@@ -31,6 +35,8 @@ enum Commands {
Binarize(BinarizeCmd),
#[command(about="Identify languages of input text", visible_alias="detect")]
Identify(IdentifyCmd),
#[command(about="Create heliport models")]
CreateModel(CreateModelCmd),
}


@@ -41,12 +47,17 @@ pub fn cli_run() -> PyResult<()> {
let os_args = std::env::args_os().skip(1);
let args = Cli::parse_from(os_args);
debug!("Module path found at: {}", module_path().expect("Could not found module path").display());
env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
if !args.quiet {
env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
} else {
env_logger::Builder::from_env(Env::default().default_filter_or("error")).init();
}

match args.command {
#[cfg(feature = "download")]
Commands::Download(cmd) => { cmd.cli() },
Commands::Binarize(cmd) => { cmd.cli() },
Commands::Identify(cmd) => { cmd.cli() },
Commands::CreateModel(cmd) => { cmd.cli() },
}
}
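A hedged sketch of the new global flag in use (binary name assumed): with `--quiet` the logger defaults to the error level instead of info, so the progress messages added in this PR are suppressed.

```
# Suppress info-level progress messages; only errors are printed.
heliport --quiet identify input.txt predictions.txt
```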