Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
liuyug committed Feb 22, 2016
0 parents commit b3f374d
Show file tree
Hide file tree
Showing 4 changed files with 207 additions and 0 deletions.
11 changes: 11 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
=======
CHARSET
=======

GB2312
======
The Chinese GB2312 character set.

ASCII
=====
ASCII characters.
39 changes: 39 additions & 0 deletions ascii.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env python
# -*- encoding:utf-8 -*-

import sys


def ascii_set():
    """Return the code points listed in the ASCII table.

    The sequence starts at 32 (the space character) and stops before 128,
    so 127 is the last code point it contains.
    """
    first_code, stop_code = 32, 128
    return range(first_code, stop_code)


def txt(output=None):
    """Write the ASCII code table as plain text.

    Each cell shows one code point in decimal, in hexadecimal, as an HTML
    numeric character reference and as the character itself.  Cells are
    laid out column-major: reading down a column walks consecutive code
    points.

    :param output: object with a ``write(str)`` method.  Defaults to the
        ``sys.stdout`` that is current when the function is called.
    """
    if output is None:
        # Looked up per call: a ``sys.stdout`` default in the signature is
        # captured once at definition time and would ignore any later
        # replacement of sys.stdout.
        output = sys.stdout

    codes = ascii_set()
    header = ' | DEC HEX HTML CH'
    cell = ' | %3d 0x%02x &#%03d; %02c'
    row_end = ' |\n'
    # NOTE(review): the header text is 18 characters wide but a formatted
    # cell is 21, so header and data columns do not line up and data rows
    # run past 80 columns -- confirm the intended header spacing.
    ncol = 80 // len(header)
    nrow, less = divmod(len(codes), ncol)

    output.write(header * ncol)
    output.write(row_end)
    for row in range(nrow):
        for col in range(ncol):
            code = codes[row + col * nrow]
            output.write(cell % (code, code, code, chr(code)))
        output.write(row_end)
    if less:
        # Code points that do not fill a whole row go on one short last row.
        for code in codes[-less:]:
            output.write(cell % (code, code, code, chr(code)))
        output.write(row_end)


if __name__ == '__main__':
    # Executed as a script: print the table to standard output.
    txt(output=sys.stdout)
138 changes: 138 additions & 0 deletions gb2312.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/bin/env python
# -*- encoding:utf-8 -*-

import struct


class GB2312(object):
    """Lookup table for the 94 x 94 section/position code grid.

    A cell is addressed by a section and a position, each in 1..94.  Its
    integer code is ``((0xA0 + section) << 8) | (0xA0 + position)``.
    ``self.charset`` maps that code to the decoded character; a cell that
    cannot be decoded holds the ``errors`` text if one was given, otherwise
    an empty string.
    """

    # Number of sections (rows) and of positions per section (columns).
    SECTIONS = 94
    POSITIONS = 94
    # Added to a section or position number to obtain its byte value.
    OFFSET = 0xa0

    def __init__(self, errors=None):
        """Decode every cell of the grid.

        :param errors: text stored for cells that cannot be decoded; when
            falsy those cells hold an empty string.
        """
        self.charset = {}
        # character -> code, for successfully decoded cells only, so that
        # neither the empty string nor the ``errors`` placeholder can be
        # "found" by sp_search().
        self._codes = {}
        for sec in range(1, self.SECTIONS + 1):
            for pos in range(1, self.POSITIONS + 1):
                gb_code = self._code(sec, pos)
                # Big-endian unsigned short: section byte, then position byte.
                b_code = struct.pack('>1H', gb_code)
                # NOTE(review): decoded with the 'gbk' codec, a superset of
                # GB2312, so cells that GB2312-80 leaves undefined may still
                # yield a character -- confirm this is intended.
                ch = b_code.decode('gbk', errors='ignore')
                if ch:
                    self._codes[ch] = gb_code
                elif errors:
                    ch = errors
                self.charset[gb_code] = ch

    @classmethod
    def _code(cls, sec, pos):
        """Return the integer code of the cell at *sec*, *pos*."""
        return ((sec + cls.OFFSET) << 8) | (pos + cls.OFFSET)

    def __repr__(self):
        return '<GB2312>'

    def get_sections(self, sections=None):
        """Return the characters of the requested sections.

        :param sections: iterable of section numbers (1..94); ``None``
            selects every section.  Any iterable is honoured, not only
            ``list`` -- on Python 3 ``range(...)`` is not a list.
        :returns: one list of 94 entries per requested section, in the
            order the sections were given.
        :raises KeyError: if a section number is outside 1..94.
        """
        if sections is None:
            sections = range(1, self.SECTIONS + 1)
        return [
            [self.charset[self._code(sec, pos)]
             for pos in range(1, self.POSITIONS + 1)]
            for sec in sections
        ]

    def get_symbols(self):
        """Return sections 1 through 9, in the form get_sections() uses."""
        return self.get_sections(range(1, 10))

    def sp_search(self, chars):
        """Look up the section and position of each character.

        :param chars: iterable of single characters.
        :returns: a list with one entry per character: a 4-digit string,
            two zero-padded digits of section followed by two of position
            (``'1601'`` for section 16, position 1), or ``None`` when the
            character is not in the table.  Zero padding keeps the result
            unambiguous; without it section 1 / position 11 and
            section 11 / position 1 would both read ``'111'``.
        """
        codes = []
        for ch in chars:
            code = self._codes.get(ch)
            if code is None:
                codes.append(None)
                continue
            sec = (code >> 8) - self.OFFSET
            pos = (code & 0xff) - self.OFFSET
            codes.append('%02d%02d' % (sec, pos))
        return codes

    def as_txt(self, errors=None):
        """Render the whole grid as text, one line per section.

        The first line carries the position numbers; each following line
        starts with its section number.

        :param errors: text shown for empty cells (default: nothing).
        """
        err_ch = errors if errors else ''
        lines = [
            ' ' * 3 + ' '.join(
                ['%02d' % pos for pos in range(1, self.POSITIONS + 1)])
        ]
        for count, sec_set in enumerate(self.get_sections(), 1):
            lines.append(
                '%02d ' % count + ' '.join([ch or err_ch for ch in sec_set]))
        return '\n'.join(lines)

    def as_html(self, errors=None):
        """Render the whole grid as a standalone HTML document.

        :param errors: text shown for empty cells (default: nothing).  It
            is inserted verbatim, without HTML escaping.
        """
        err_ch = errors if errors else ''
        html = [
            '<!DOCTYPE html>',
            '<html>',
            '<head>',
            '<meta charset="UTF-8" />',
            '<title>CODE TABLE of GB2312-80</title>',
            '<style type="text/css">',
            'table {border-collapse:collapse;border-spacing:0;}',
            'td {border:1px solid green;padding:0.3em;text-align:center;}',
            # NOTE(review): "border:width:75%" is not a valid CSS
            # declaration -- confirm what the rule was meant to be.
            'hr {border:width:75%;}',
            '</style>',
            '</head>',
            '<body>',
            '<h1>CODE TABLE of GB2312-80</h1>',
            '<p>Made by Yugang LIU</p>',
            '<hr />',
            '<ul>',
            '<li>s: section</li>',
            '<li>p: position</li>',
            '</ul>',
            # "<" is escaped for HTML, and the formula is parenthesised to
            # match the computation actually used for the codes.
            '<p><code>code = ((0xA0 + sec) &lt;&lt; 8) | (0xA0 + pos)'
            '</code></p>',
            '<table>',
        ]
        # Header row: corner cell, then the position numbers.
        html.append(
            '<tr>' +
            '<td>s\\p</td>' +
            ''.join(['<td>%02d</td>' % pos
                     for pos in range(1, self.POSITIONS + 1)]) +
            '</tr>'
        )
        for count, sec_set in enumerate(self.get_sections(), 1):
            html.append(
                '<tr>' +
                '<td>%02d</td>' % count +
                ''.join(['<td>%s</td>' % (ch or err_ch) for ch in sec_set]) +
                '</tr>'
            )
        html.extend(['</table>', '</body>', '</html>'])
        return '\n'.join(html)


if __name__ == '__main__':
    import argparse
    import sys

    def _emit(text):
        """Write *text* and a newline to standard output as UTF-8 bytes."""
        # Flush pending text first so the byte write below cannot overtake it.
        sys.stdout.flush()
        # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
        # sys.stdout accepts bytes directly.  Printing the encoded bytes
        # instead would show a b'...' repr on Python 3.
        stream = getattr(sys.stdout, 'buffer', sys.stdout)
        stream.write(text.encode('utf-8') + b'\n')

    parser = argparse.ArgumentParser()
    parser.add_argument('--output-html', action='store_true', help='output html table')
    parser.add_argument('--output-txt', action='store_true', help='output txt table')
    parser.add_argument('--output-table', action='store_true', help='output ascii table')
    parser.add_argument('--sp-search', help='section and position for gb2312 character')
    args = parser.parse_args()

    gb2312 = GB2312()
    if args.output_txt:
        _emit(gb2312.as_txt(errors=u' '))
    if args.output_html:
        _emit(gb2312.as_html())
    if args.output_table:
        char_set = gb2312.get_sections(range(1, 94 + 1))
        # Third-party module, imported only when this option is used.
        from asciitable import AsciiTable
        a = AsciiTable(char_set, header=False)
        _emit(a.table())
    if args.sp_search:
        chars = args.sp_search
        if isinstance(chars, bytes):
            # Python 2 delivers command-line arguments as byte strings;
            # on Python 3 they are already text and have no decode().
            chars = chars.decode('utf-8')
        print('GB2312 section and position:')
        print(gb2312.sp_search(chars))
19 changes: 19 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env python
# -*- encoding:utf-8 -*-

from distutils.core import setup


# Release number reported for the distribution.
VERSION = '0.1.0'


# Register the distribution; both modules are installed as scripts.
setup(
    # Identity
    name='charset',
    version=VERSION,
    description='charset for Chinese GB2312',
    license='BSD',
    # Author and project home
    author='Yugang LIU',
    author_email='liuyug@gmail.com',
    url='https://github.com/liuyug/charset',
    # Installed scripts
    scripts=[
        'gb2312.py',
        'ascii.py',
    ],
)

0 comments on commit b3f374d

Please sign in to comment.