Skip to content

Commit

Permalink
Modify changes from budokans's PR to limit conversion of "--" to ' - …
Browse files Browse the repository at this point in the history
…' to only occasions when the original character was an em dash.
  • Loading branch information
kshetline committed Aug 26, 2021
1 parent 89a403e commit e76d42f
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 3 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Change Log

## 1.0.3

* Special handling for em dashes: In smart spacing mode, em dashes become " - " instead of "--" when found between words.

## 1.0.2

* German mode now works with combining umlaut (diaeresis).
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"transliteration",
"emoji"
],
"version": "1.0.2",
"version": "1.0.3",
"dependencies": {},
"repository": "https://github.com/kshetline/unidecode-plus.git",
"author": "Kerry Shetline <kerry@shetline.com>",
Expand Down
7 changes: 7 additions & 0 deletions test/unidecode.mocha.js
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ describe('Smart spacing', function() {
it('should replace an em-dash straddled by word characters with " - " instead of "--"', function() {
  // Every call gets its own options object so the cases stay independent.
  var decodeSmart = function(text) {
    return unidecode(text, { smartSpacing: true });
  };

  // Word characters on both sides: the em dash is rendered as a spaced hyphen.
  assert.equal(decodeSmart("No—I mean yes!"), "No - I mean yes!");

  // Non-word neighbours ("#"): the plain "--" transliteration is expected.
  assert.equal(decodeSmart("#—#"), "#--#");
});

it('should leave ASCII double dashes unchanged', function() {
  // The input already contains a literal "--" (not an em dash), so the
  // expected output is the input itself.
  var input = "No--I mean yes!";
  var actual = unidecode(input, { smartSpacing: true });

  assert.equal(actual, "No--I mean yes!");
});

it('should handle deferred smart spacing', function() {
Expand Down
7 changes: 5 additions & 2 deletions unidecode.js
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ function unidecode_internal_replace(ch) {
var high = cp >> 8;
var row = high + (high === 0 && german ? 0.5 : 0);
var low = cp & 0xFF;
var emDash = cp === 0x2014;
// This doesn't cover all emoji, just those currently defined.
var emoji = (high === 0x1F4 || high === 0x1F6 || high === 0x1F9);

Expand Down Expand Up @@ -96,6 +97,8 @@ function unidecode_internal_replace(ch) {

ch = tr[row][low];

if (smartSpacing && emDash)
return '\x80--\x80';
if (!smartSpacing || ch === '[?]' || ch === '_' || /^\w+$/.test(ch))
return ch;
else if (emoji)
Expand All @@ -106,13 +109,13 @@ function unidecode_internal_replace(ch) {

/**
 * Resolves the placeholder markers left in a transliterated string by
 * smart-spacing mode and returns the final text.
 *
 * Markers handled here:
 *   \x80 - provisional marker; an em dash arrives as '\x80--\x80'.
 *          (Presumably other replacements are wrapped in \x80 as well;
 *          that code is outside this function.)
 *   \x81 - intermediate marker produced below; every run of it is finally
 *          collapsed into a single space.
 *
 * A literal ASCII "--" in the input carries no markers and is therefore
 * left untouched.
 *
 * @param {string} str - Text that may contain \x80 markers.
 * @returns {string} The text with all markers resolved.
 */
function resolveSpacing(str) {
  return str
    // A marked em dash between two word characters becomes " - ". The
    // trailing word character is only looked at, not consumed, so that
    // back-to-back cases such as "a—b—c" are all converted.
    .replace(/(\w)\x80--\x80(?=\w)/g, '$1 - ')
    // Drop a marker that is not followed by a word character.
    .replace(/\x80(?!\w)/g, "")
    // A doubled marker, or a marker directly after a word character,
    // turns into \x81 (the word character itself is kept).
    .replace(/\x80\x80|(\w)\x80/g, "$1\x81")
    // Any marker still left over is discarded.
    .replace(/\x80/g, "")
    // No spacing at the very start or end of the string.
    .replace(/^\x81+|\x81+$/g, "")
    // \x81, space, \x81 is just one space.
    .replace(/\x81 \x81/g, " ")
    // A run of \x81 (optionally preceded by whitespace) is one space.
    .replace(/\s?\x81+/g, " ");
}

module.exports.resolveSpacing = resolveSpacing;

0 comments on commit e76d42f

Please sign in to comment.