-
Notifications
You must be signed in to change notification settings - Fork 1
/
2ascii
executable file
·129 lines (125 loc) · 1.91 KB
/
2ascii
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/bin/sed -f
# -*- coding: utf-8 -*-
#
# NAME
#
# 2ascii - convert UTF-8 punctuation to "equivalent" ASCII
#
# SYNOPSIS
#
# 2ascii [-i] [<file>...]
#
# DESCRIPTION
#
# 2ascii is a sed(1) script to convert UTF-8 punctuation to ASCII.
# You can sanitize the file further by piping it through iconv:
#
# iconv -c -f utf-8 -t ascii//TRANSLIT [<file>...]
#
# OPTIONS
#
# -i Edit files in place
#
# Fix (long) dashes: a figure dash, en dash, em dash or horizontal bar
# squeezed directly between two alphanumerics becomes " - ".
#
# One `g' pass is not enough for input such as "a–b–c": the "b" is
# consumed as the right-hand side of the first match, so it cannot also
# serve as the left-hand side of the second one.  Repeat the substitution
# until it stops matching; every successful pass removes at least one
# dash, so the loop always terminates.
:dash
s/\([[:alnum:]]\)[–‒—―]\([[:alnum:]]\)/\1 - \2/g
t dash
# Punctuation.
#
# Each rule maps one non-ASCII character to an ASCII look-alike.  The
# rules are independent of each other: no replacement text contains a
# character that another rule in this group matches.
s/[¤]/*/g
s/[§]/*/g
s/[©]/(C)/g
s/[«]/"/g
# SOFT HYPHEN (U+00AD).  The character is invisible and is easily lost
# when the script is copied or re-encoded, which would leave an empty,
# invalid bracket expression behind.  Spell it as its UTF-8 byte
# sequence instead (GNU sed \xHH escapes).
s/\xC2\xAD/-/g
s/[®]/(R)/g
s/[°]/deg. /g
s/[´]/'/g
s/[»]/"/g
s/[÷]/\//g
s/[ǀ]/|/g
s/[ǃ]/!/g
s/[ʹ]/'/g
s/[ʺ]/"/g
s/[ʼ]/'/g
s/[˄]/^/g
s/[ˆ]/^/g
s/[ˈ]/'/g
s/[ˋ]/`/g
s/[ˍ]/_/g
s/[˜]/~/g
s/[։]/:/g
s/[׀]/|/g
s/[׃]/:/g
s/[٪]/%/g
s/[٭]/*/g
# Hyphens and dashes.  Dashes that were not rewritten earlier as " - "
# fall through to "--" here.
s/[‐]/-/g
s/[‑]/-/g
s/[‒]/--/g
s/[–]/--/g
s/[—]/--/g
s/[―]/--/g
s/[‖]/||/g
s/[‗]/_/g
# Quotation marks.
s/[‘]/'/g
s/[’]/'/g
s/[‚]/,/g
s/[‛]/'/g
s/[“]/"/g
s/[”]/"/g
s/[„]/"/g
s/[‟]/"/g
s/[…]/.../g
# Primes.  A triple prime is three primes, so it becomes three ASCII
# apostrophes (two would be indistinguishable from a double prime written
# as '').  The reversed triple prime likewise becomes three backticks,
# matching the single reversed prime rule below.
s/[′]/'/g
s/[″]/"/g
s/[‴]/'''/g
s/[‵]/`/g
s/[‶]/"/g
s/[‷]/```/g
s/[‸]/^/g
s/[‹]/</g
s/[›]/>/g
s/[‽]/?/g
s/[⁄]/\//g
s/[⁎]/*/g
s/[⁒]/%/g
s/[⁓]/~/g
# Letterlike symbols.
s/[№]/No./g
s/[℗]/ (P)/g
s/[℠]/ (SM)/g
s/[™]/ (TM)/g
# Mathematical operators.
s/[−]/-/g
s/[∕]/\//g
s/[∖]/\\/g
s/[∗]/*/g
s/[∣]/|/g
s/[∶]/:/g
s/[∼]/~/g
s/[≤]/<=/g
s/[≥]/>=/g
s/[≦]/<=/g
s/[≧]/>=/g
s/[⌃]/^/g
# LEFT/RIGHT-POINTING ANGLE BRACKET (U+2329, U+232A).  These render
# exactly like the CJK angle brackets further down and Unicode
# normalisation folds them into those, so they are written as UTF-8 byte
# sequences (GNU sed \xHH escapes) to keep the two pairs distinct.
s/\xE2\x8C\xA9/</g
s/\xE2\x8C\xAA/>/g
s/[◊]/*/g
s/[♯]/#/g
s/[✱]/*/g
s/[❘]/|/g
s/[❢]/!/g
# Mathematical brackets.  Both the opening and the closing white square
# bracket are mapped so that a converted pair stays balanced.
s/[⟦]/[/g
s/[⟧]/]/g
s/[⟨]/</g
s/[⟩]/>/g
s/[⦃]/{/g
s/[⦄]/}/g
# CJK symbols and punctuation.
s/[〃]/"/g
# LEFT/RIGHT ANGLE BRACKET (U+3008, U+3009), as byte sequences for the
# same reason as U+2329/U+232A above.
s/\xE3\x80\x88/</g
s/\xE3\x80\x89/>/g
s/[〚]/[/g
s/[〛]/]/g
s/[〜]/~/g
s/[〝]/"/g
s/[〞]/"/g
# Convert DOS style line endings to *nix style, and turn any carriage
# return left inside the line (bare CR line endings) into a newline.
# This runs before the sentence spacing rule so that a carriage return
# following a full stop is treated as a line ending, not as inter-sentence
# whitespace.
s/\r$//
s/\r/\n/g
# Convert n-space separated sentences into two space separated
# sentences.  This is still buggy as it fails to differentiate between
# periods used to end sentences, and periods used to shorten words like
# in "Mrs.".
#
# Only horizontal whitespace ([[:blank:]]: space and tab) is collapsed;
# matching [[:space:]] here would also swallow the newlines produced
# above and join the lines they separate.
s/\([^[:space:]]\{2\}[.!?][`'"]\?\)[[:blank:]]\+/\1  /g