Skip to content

Commit

Permalink
Update spm (marian-nmt#3)
Browse files Browse the repository at this point in the history
* Adding alternative project name for spm latest to prevent lib conflicts

* Update cmake

* Update CMakeFiles to allow for configurable artifact names

* Enables --encode_unicode_case option for case-aware sentence piece (marian-nmt#10)

* Enables --encode_unicode_case option for case-aware sentence piece
* Example: "This IS a TEST OF THE CASING" gets converted internally to "Tthis Uis a Atest of the casing" before segmentation.
* This is fully reversible.

* Enable toggling Case Encoding flag from C++ Train API (marian-nmt#11)

* Enable toggling Case Encoding flag from C++ Train API
* Fixing issue with hardcoding truth value of encode_decode_case flag

* Disable denormalizer flags (marian-nmt#13)

Co-authored-by: Rohit Jain <Rohit.Jain@microsoft.com>

* Fix Surface String to Token Mappings for Case Encoding (marian-nmt#12)

Co-authored-by: Marcin Junczys-Dowmunt <marcinjd@microsoft.com>
Co-authored-by: Rohit Jain <Rohit.Jain@microsoft.com>

* add one header file to installation

* Rename VERSION to VERSION.txt

* Rename VERSION to VERSION.txt

Installing the Python package fails with the error below.
This change addresses the issue.
```
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [10 lines of output]
      Traceback (most recent call last):
        File "<string>", line 2, in <module>
        File "<pip-setuptools-caller>", line 34, in <module>
        File "/home/alferre/code/sentencepiece/python/setup.py", line 111, in <module>
          version=version(),
        File "/home/alferre/code/sentencepiece/python/setup.py", line 36, in version
          with codecs.open('VERSION.txt', 'r', 'utf-8') as f:
        File "/opt/conda/envs/ptca/lib/python3.8/codecs.py", line 905, in open
          file = builtins.open(filename, mode, buffering)
      FileNotFoundError: [Errno 2] No such file or directory: 'VERSION.txt'
      [end of output]
```

---------

Co-authored-by: Rohit Jain <rjai@microsoft.com>
Co-authored-by: Rohit Jain <Rohit.Jain@microsoft.com>
Co-authored-by: Marcin Junczys-Dowmunt <marcinjd@microsoft.com>
Co-authored-by: Roman Grundkiewicz <rgrundkiewicz@gmail.com>
Co-authored-by: alexandremuzio <ax.muzio@gmail.com>
  • Loading branch information
6 people authored May 3, 2023
1 parent 60c17dd commit acb66f4
Show file tree
Hide file tree
Showing 18 changed files with 66,932 additions and 136,107 deletions.
12 changes: 11 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
file(STRINGS "VERSION.txt" SPM_VERSION)
message(STATUS "VERSION: ${SPM_VERSION}")
project(sentencepiece VERSION ${SPM_VERSION} LANGUAGES C CXX)
SET(SPM_ARTIFACT_NAME "sentencepiece" CACHE STRING "Default name of the \
generated artifacts. Override to avoid name conflicts.")

project(${SPM_ARTIFACT_NAME} VERSION ${SPM_VERSION} LANGUAGES C CXX)

option(SPM_ENABLE_NFKC_COMPILE "Enables NFKC compile" OFF)
option(SPM_ENABLE_SHARED "Builds shared libaries in addition to static libraries." ON)
Expand Down Expand Up @@ -100,6 +103,13 @@ endif()
add_subdirectory(src)
add_subdirectory(third_party)

# Rename the generated library files only when a non-default artifact name
# was requested. Every target is checked with if(TARGET ...) first because
# set_target_properties() aborts configuration when the target does not exist;
# presumably the shared targets are absent when SPM_ENABLE_SHARED is OFF
# (verify against src/CMakeLists.txt).
if (NOT SPM_ARTIFACT_NAME STREQUAL "sentencepiece")
  # Core library, shared and static flavours.
  foreach (spm_lib_target sentencepiece sentencepiece-static)
    if (TARGET ${spm_lib_target})
      set_target_properties(${spm_lib_target} PROPERTIES OUTPUT_NAME "${SPM_ARTIFACT_NAME}")
    endif()
  endforeach()
  # Trainer library, shared and static flavours.
  foreach (spm_lib_target sentencepiece_train sentencepiece_train-static)
    if (TARGET ${spm_lib_target})
      set_target_properties(${spm_lib_target} PROPERTIES OUTPUT_NAME "${SPM_ARTIFACT_NAME}_train")
    endif()
  endforeach()
endif()

set(CPACK_SOURCE_GENERATOR "TXZ")
set(CPACK_GENERATOR "7Z")
set(CPACK_PACKAGE_VERSION "${SPM_VERSION}")
Expand Down
3 changes: 2 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ set(SPM_SRCS
model_factory.cc
model_interface.cc
normalizer.cc
case_encoder.cc
sentencepiece_processor.cc
unigram_model.cc
util.cc
Expand Down Expand Up @@ -284,7 +285,7 @@ install(TARGETS ${SPM_INSTALLTARGETS}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
install(FILES sentencepiece_trainer.h sentencepiece_processor.h
install(FILES sentencepiece_trainer.h sentencepiece_processor.h builtin_pb/sentencepiece.pb.h
DESTINATION ${CMAKE_INSTALL_INCDIR})

file(TO_NATIVE_PATH "${PROJECT_SOURCE_DIR}/data" data_dir)
Expand Down
77 changes: 77 additions & 0 deletions src/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

#include <set>

#include "case_encoder.h"
#include "normalization_rule.h"
#include "normalizer.h"
#include "third_party/darts_clone/darts.h"
Expand Down Expand Up @@ -475,6 +476,82 @@ util::Status Builder::BuildNmtNFKC_CFMap(CharsMap *chars_map) {
return util::OkStatus();
}

// Populates `chars_map` with the rules that replace case information by
// explicit marker characters:
//   - a punctuation code point `c` maps to {cPunctuation marker, c};
//   - an upper-case code point `c` whose default case folding is a different,
//     lower-case code point maps to {cUppercase marker, folded(c)}.
// The upper-case rule is written after the punctuation rule, so it wins if a
// code point ever satisfies both tests.
// Without ENABLE_NFKC_COMPILE the map is left untouched and OK is returned.
// Returns the error from RemoveRedundantMap if that post-processing fails.
util::Status Builder::BuildUncaserMap(Builder::CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  LOG(INFO) << "Running BuildUncaserMap";

  constexpr int kMaxUnicode = 0x10FFFF;
  constexpr char32 kUpperMarker = static_cast<char32>(normalizer::cUppercase);
  constexpr char32 kPunctMarker = static_cast<char32>(normalizer::cPunctuation);

  for (char32 c = 1; c <= kMaxUnicode; ++c) {
    if (!U_IS_UNICODE_CHAR(c)) continue;

    if (u_ispunct(c)) {
      (*chars_map)[{c}] = {kPunctMarker, c};
    }

    if (!u_isupper(c)) continue;

    // Only keep the rule when folding really yields a distinct lower-case
    // code point.
    const char32 folded = u_foldCase(c, U_FOLD_CASE_DEFAULT);
    if (folded == c || !u_islower(folded)) continue;
    (*chars_map)[{c}] = {kUpperMarker, folded};
  }

  LOG(INFO) << "Character map size for Uncaser: " << chars_map->size();

  RETURN_IF_ERROR(RemoveRedundantMap(chars_map));
#endif

  return util::OkStatus();
}

// Populates `chars_map` with the rules that turn a case marker followed by a
// lower-case code point back into the upper-case code point:
//   {cUppercase marker, folded(c)} -> {c}
//   {cTitlecase marker, folded(c)} -> {c}
// for every upper-case `c` whose default case folding is a different,
// lower-case code point. Code points are visited in ascending order and an
// existing key is never overwritten, so the first `c` seen for a given folded
// form is the one that is kept.
// Without ENABLE_NFKC_COMPILE the map is left untouched and OK is returned.
// Returns the error from RemoveRedundantMap if that post-processing fails.
util::Status Builder::BuildRecaserMap(Builder::CharsMap *chars_map) {
#ifdef ENABLE_NFKC_COMPILE
  LOG(INFO) << "Running BuildRecaserMap";

  constexpr int kMaxUnicode = 0x10FFFF;
  constexpr char32 kUpperMarker = static_cast<char32>(normalizer::cUppercase);
  constexpr char32 kTitleMarker = static_cast<char32>(normalizer::cTitlecase);

  // Adds `{marker, folded} -> {upper}` unless that key already has a rule.
  const auto add_if_absent = [chars_map](char32 marker, char32 folded,
                                         char32 upper) {
    if (chars_map->find({marker, folded}) != chars_map->end()) return;
    (*chars_map)[{marker, folded}] = {upper};
  };

  for (char32 c = 1; c <= kMaxUnicode; ++c) {
    if (!U_IS_UNICODE_CHAR(c)) continue;
    if (!u_isupper(c)) continue;

    const char32 folded = u_foldCase(c, U_FOLD_CASE_DEFAULT);
    if (folded == c || !u_islower(folded)) continue;

    add_if_absent(kUpperMarker, folded, c);
    add_if_absent(kTitleMarker, folded, c);
  }

  RETURN_IF_ERROR(RemoveRedundantMap(chars_map));
#endif

  return util::OkStatus();
}

// static
// Rewrites `chars_map` in place as the composition
// outer_chars_map(chars_map): every target sequence in `chars_map` that
// appears, as a whole, as a source key in `outer_chars_map` is replaced by the
// corresponding outer target. Targets with no matching outer key are kept.
// When `add_rest` is true, rules of `outer_chars_map` whose source key is not
// yet present in `chars_map` are copied over as well; existing keys are never
// overwritten. Always returns OK.
util::Status Builder::ComposeCharsMaps(const Builder::CharsMap &outer_chars_map,
                                       Builder::CharsMap *chars_map,
                                       bool add_rest) {
  for (auto rule = chars_map->begin(); rule != chars_map->end(); ++rule) {
    const auto outer_rule = outer_chars_map.find(rule->second);
    if (outer_rule == outer_chars_map.end()) continue;
    rule->second = outer_rule->second;
  }

  if (!add_rest) return util::OkStatus();

  // insert() leaves an already-present key untouched, which matches the
  // "add only what is missing" contract.
  for (const auto &outer_rule : outer_chars_map) {
    chars_map->insert(outer_rule);
  }
  return util::OkStatus();
}

// static
util::Status Builder::LoadCharsMap(absl::string_view filename,
CharsMap *chars_map) {
Expand Down
6 changes: 6 additions & 0 deletions src/builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ class Builder {
// Makes NMT NFKC with Unicode case folding.
static util::Status BuildNmtNFKC_CFMap(CharsMap *chars_map);

// Makes the map that replaces case by marker characters: punctuation maps to
// {punctuation marker, char} and an upper-case character maps to
// {upper-case marker, case-folded char}. Only populated when built with
// ENABLE_NFKC_COMPILE.
static util::Status BuildUncaserMap(CharsMap *chars_map);
// Makes the inverse map: {upper-case or title-case marker, lower-case char}
// maps back to the upper-case character. Only populated when built with
// ENABLE_NFKC_COMPILE.
static util::Status BuildRecaserMap(CharsMap *chars_map);

// Create composition outer_chars_map(chars_map) into `chars_map`.
// Each target in `chars_map` that is a source key of `outer_chars_map` is
// replaced by the outer target. If `add_rest` is true, outer rules whose
// source key is missing from `chars_map` are added too.
static util::Status ComposeCharsMaps(const CharsMap &outer_chars_map, CharsMap *chars_map, bool add_rest);

// Builds Chars map save in `filename`.
// Format:
// src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...
Expand Down
86 changes: 76 additions & 10 deletions src/builtin_pb/sentencepiece_model.pb.cc

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit acb66f4

Please sign in to comment.