#
#  kanjilib.py - an encoding conversion library for Japanese text processing
#                Tamito KAJIYAMA <27 May 2001>
#
#  NOTE: This file contains Japanese characters (EUC-JP).
#
#  $Id: kanjilib.py,v 1.3 2002/10/24 12:56:14 kajiyama Exp $
#
#  Copyright (c) 2001 Tamito KAJIYAMA.  All rights reserved.
#
#  Permission to use, copy, modify, and distribute this software and its
#  documentation for any purpose and without fee is hereby granted,
#  provided that the above copyright notice appear in all copies and that
#  both that copyright notice and this permission notice appear in
#  supporting documentation, and that the name of Tamito KAJIYAMA not be
#  used in advertising or publicity pertaining to distribution of the
#  software without specific, written prior permission.
#
#  TAMITO KAJIYAMA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
#  INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
#  EVENT SHALL TAMITO KAJIYAMA BE LIABLE FOR ANY SPECIAL, INDIRECT OR
#  CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
#  USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
#  OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
#  PERFORMANCE OF THIS SOFTWARE.
#

import string

class KanjiError(ValueError):
    """Raised by the converters in this module for unusable input.

    The functions below raise it for truncated or malformed strings,
    unknown escape sequences and unknown error handling schemes.
    """


# Module-level alias for the exception class.
error = KanjiError

# EUC-JP to Shift_JIS
def euc2sjis(s, error="strict"):
    """Convert an EUC-JP encoded string to Shift_JIS.

    s is a string of single-byte characters (every ord() below 256).
    error selects what happens on undecodable input:
      "strict"  -- raise KanjiError
      "replace" -- emit the Shift_JIS geta mark (0x81 0xac)
      "ignore"  -- drop the offending bytes

    Raises KanjiError for an unknown error scheme and, in strict mode,
    for truncated or malformed input.

    NOTE: three-byte sequences introduced by 0x8f are not recognised;
    the 0x8f byte and its successor are reported as malformed.
    """
    if error not in ("strict", "replace", "ignore"):
        raise KanjiError("unknown error handling scheme")
    t = []
    i, j = 0, len(s)
    while i < j:
        c1 = ord(s[i])
        i = i + 1
        if c1 < 0x80:
            t.append(chr(c1))
            continue
        if i >= j:
            # A lead byte with no trail byte after it.
            if error == "strict":
                raise KanjiError("truncated string")
            if error == "replace":
                t.append("\x81\xac") # Shift_JIS
            break
        c2 = ord(s[i])
        i = i + 1
        if c1 == 0x8e and 0xa1 <= c2 <= 0xdf: # Halfwidth Katakana
            # Only 0xa1-0xdf are halfwidth katakana; anything else after
            # 0x8e falls through to the malformed-input handling below.
            t.append(chr(c2))
        elif 0xa1 <= c1 <= 0xfe and 0xa1 <= c2 <= 0xfe:
            # Floor division keeps the result an int on every Python version.
            if c1 & 1:
                if c1 < 0xdf:
                    c1 = c1 // 2 + 0x31
                else:
                    c1 = c1 // 2 + 0x71
                if c2 < 0xe0:
                    c2 = c2 - 0x61
                else:
                    c2 = c2 - 0x60
            else:
                if c1 < 0xdf:
                    c1 = c1 // 2 + 0x30
                else:
                    c1 = c1 // 2 + 0x70
                c2 = c2 - 2
            t.append(chr(c1))
            t.append(chr(c2))
        elif error == "strict":
            raise KanjiError("malformed string")
        elif error == "replace":
            t.append("\x81\xac") # Shift_JIS
    return ''.join(t)

# Shift_JIS to EUC-JP
def sjis2euc(s, error="strict"):
    """Convert a Shift_JIS encoded string to EUC-JP.

    s is a string of single-byte characters (every ord() below 256).
    error selects what happens on undecodable input:
      "strict"  -- raise KanjiError
      "replace" -- emit the EUC-JP geta mark (0xa2 0xae)
      "ignore"  -- drop the offending bytes

    Bytes 0xa1-0xdf are halfwidth katakana and are written as 0x8e
    followed by the same byte.

    Raises KanjiError for an unknown error scheme and, in strict mode,
    for truncated or malformed input.
    """
    if error not in ("strict", "replace", "ignore"):
        raise KanjiError("unknown error handling scheme")
    t = []
    i, j = 0, len(s)
    while i < j:
        c1 = ord(s[i])
        i = i + 1
        if c1 < 0x80:
            t.append(chr(c1))
            continue
        if 0xa1 <= c1 <= 0xdf: # Halfwidth Katakana
            t.append("\x8e")
            t.append(chr(c1))
            continue
        if i >= j:
            # A lead byte with no trail byte after it.
            if error == "strict":
                raise KanjiError("truncated string")
            if error == "replace":
                t.append("\xa2\xae") # EUC-JP
            break
        c2 = ord(s[i])
        i = i + 1
        # Reject bytes the arithmetic below would otherwise alias onto
        # valid characters: lead 0xa0 collides with 0xe0 and trail 0x7f
        # collides with 0x7e.
        valid = c1 != 0xa0 and 0x40 <= c2 <= 0xfc and c2 != 0x7f
        if valid:
            if c2 < 0x9f:
                if c1 < 0xe0:
                    c1 = c1 * 2 - 0x61
                else:
                    c1 = c1 * 2 - 0xe1
                if c2 < 0x7f:
                    c2 = c2 + 0x61
                else:
                    c2 = c2 + 0x60
            else:
                if c1 < 0xe0:
                    c1 = c1 * 2 - 0x60
                else:
                    c1 = c1 * 2 - 0xe0
                c2 = c2 + 2
            # Remaining invalid lead bytes (0x80, 0xf0 and above) land
            # outside the EUC-JP range and are caught here.
            valid = 0xa1 <= c1 <= 0xfe and 0xa1 <= c2 <= 0xfe
        if valid:
            t.append(chr(c1))
            t.append(chr(c2))
        elif error == "strict":
            raise KanjiError("malformed string")
        elif error == "replace":
            t.append("\xa2\xae") # EUC-JP
    return ''.join(t)

# Shift_JIS to ISO-2022-JP
def sjis2jis(s, error="strict"):
    """Convert a Shift_JIS encoded string to ISO-2022-JP.

    s is a string of single-byte characters (every ord() below 256).
    Two-byte characters are wrapped in ESC $ B ... ESC ( B.
    error selects what happens on undecodable input:
      "strict"  -- raise KanjiError
      "replace" -- emit the JIS geta mark (0x22 0x2e)
      "ignore"  -- drop the offending bytes

    Raises KanjiError for an unknown error scheme, for halfwidth
    katakana (bytes 0xa1-0xdf, raised whatever the error scheme) and,
    in strict mode, for truncated or malformed input.
    """
    if error not in ("strict", "replace", "ignore"):
        raise KanjiError("unknown error handling scheme")
    t = []
    kanji = 0
    i, j = 0, len(s)
    while i < j:
        c1 = ord(s[i])
        if c1 < 0x80:
            if kanji:
                t.append("\033(B")
                kanji = 0
            t.append(s[i])
            i = i + 1
            continue
        if 0xa1 <= c1 <= 0xdf:
            raise KanjiError("unable to convert Halfwidth Katakana into JIS")
        if not kanji:
            t.append("\033$B")
            kanji = 1
        if i + 1 >= j:
            # A lead byte with no trail byte after it.
            if error == "strict":
                raise KanjiError("truncated string")
            if error == "replace":
                t.append('".') # ISO-2022-JP
            break
        c2 = ord(s[i+1])
        i = i + 2
        # Validate before converting: the 7-bit masking below would
        # otherwise let trail bytes 0x00-0x1d and 0x7f, and lead byte
        # 0xa0, wrap around into the valid range.
        valid = c1 != 0xa0 and 0x40 <= c2 <= 0xfc and c2 != 0x7f
        if valid:
            if c2 < 0x9f:
                if c1 < 0xe0:
                    c1 = (c1 * 2 - 0x61) & 0x7f
                else:
                    c1 = (c1 * 2 - 0xe1) & 0x7f
                if c2 < 0x7f:
                    c2 = (c2 + 0x61) & 0x7f
                else:
                    c2 = (c2 + 0x60) & 0x7f
            else:
                if c1 < 0xe0:
                    c1 = (c1 * 2 - 0x60) & 0x7f
                else:
                    c1 = (c1 * 2 - 0xe0) & 0x7f
                c2 = c2 - 0x7e
            valid = 0x21 <= c1 <= 0x7e and 0x21 <= c2 <= 0x7e
        if valid:
            t.append(chr(c1))
            t.append(chr(c2))
        elif error == "strict":
            raise KanjiError("malformed string")
        elif error == "replace":
            t.append('".') # ISO-2022-JP
    if kanji:
        t.append("\033(B")
    return ''.join(t)

# EUC-JP to ISO-2022-JP
def euc2jis(s, error="strict"):
    """Convert an EUC-JP encoded string to ISO-2022-JP.

    s is a string of single-byte characters (every ord() below 256).
    Two-byte characters are wrapped in ESC $ B ... ESC ( B.
    error selects what happens on undecodable input:
      "strict"  -- raise KanjiError
      "replace" -- emit the JIS geta mark (0x22 0x2e)
      "ignore"  -- drop the offending bytes

    Raises KanjiError for an unknown error scheme, for halfwidth
    katakana (lead byte 0x8e, raised whatever the error scheme) and,
    in strict mode, for truncated or malformed input.

    NOTE: three-byte sequences introduced by 0x8f are not recognised;
    the 0x8f byte and its successor are reported as malformed.
    """
    if error not in ("strict", "replace", "ignore"):
        raise KanjiError("unknown error handling scheme")
    t = []
    kanji = 0
    i, j = 0, len(s)
    while i < j:
        c1 = ord(s[i])
        if c1 < 0x80:
            if kanji:
                t.append("\033(B")
                kanji = 0
            t.append(s[i])
            i = i + 1
            continue
        if c1 == 0x8e:
            raise KanjiError("unable to convert Halfwidth Katakana into JIS")
        if not kanji:
            t.append("\033$B")
            kanji = 1
        if i + 1 >= j:
            # A lead byte with no trail byte after it.
            if error == "strict":
                raise KanjiError("truncated string")
            if error == "replace":
                t.append('".') # ISO-2022-JP
            break
        c2 = ord(s[i+1])
        if 0xa1 <= c1 <= 0xfe and 0xa1 <= c2 <= 0xfe:
            # The JIS code is the EUC-JP code with the high bits cleared.
            t.append(chr(c1 & 0x7f))
            t.append(chr(c2 & 0x7f))
        elif error == "strict":
            raise KanjiError("malformed string")
        elif error == "replace":
            t.append('".') # ISO-2022-JP
        i = i + 2
    if kanji:
        t.append("\033(B")
    return ''.join(t)

# ISO-2022-JP to EUC-JP
def jis2euc(s, error="strict"):
    """Convert an ISO-2022-JP encoded string to EUC-JP.

    s starts in ASCII mode.  ESC $ B, ESC $ @ and ESC $ ( B switch to
    two-byte mode; ESC ( B and ESC ( J switch back to one-byte mode,
    whose bytes are copied through unchanged.  A two-byte run that is
    not closed by an escape sequence is converted up to the end of s.

    error selects what happens on undecodable two-byte data:
      "strict"  -- raise KanjiError
      "replace" -- emit the EUC-JP geta mark (0xa2 0xae)
      "ignore"  -- drop the offending bytes

    Raises KanjiError for an unknown error scheme, for any other
    escape sequence (whatever the error scheme) and, in strict mode,
    for truncated or malformed input.
    """
    if error not in ("strict", "replace", "ignore"):
        raise KanjiError("unknown error handling scheme")
    t = []
    j = len(s)
    kanji = 0
    start = 0
    while 1:
        end = s.find('\033', start)
        last = end < 0
        if last:
            # No further escape: the final segment runs to the end of s.
            end = j
        if kanji:
            for i in range(start, end, 2):
                # Compare against the segment end, not len(s), so an
                # odd-length run is reported as truncated instead of
                # pairing its last byte with the following ESC.
                if i + 1 >= end:
                    if error == "strict":
                        raise KanjiError("truncated string")
                    if error == "replace":
                        t.append("\xa2\xae") # EUC-JP
                    break
                c1, c2 = ord(s[i]), ord(s[i+1])
                if 0x21 <= c1 <= 0x7e and 0x21 <= c2 <= 0x7e:
                    t.append(chr(c1 | 0x80))
                    t.append(chr(c2 | 0x80))
                elif error == "strict":
                    raise KanjiError("malformed string")
                elif error == "replace":
                    t.append("\xa2\xae") # EUC-JP
        else:
            t.append(s[start:end])
        if last:
            break
        if s[end:end+4] == '\033$(B':
            start = end + 4
            kanji = 1
        elif s[end:end+3] in ('\033$B', '\033$@'):
            start = end + 3
            kanji = 1
        elif s[end:end+3] in ('\033(B', '\033(J'):
            start = end + 3
            kanji = 0
        else:
            raise KanjiError("unknown escape sequence at %d" % end)
    return ''.join(t)

# ISO-2022-JP to Shift_JIS
def jis2sjis(s, error="strict"):
    """Convert an ISO-2022-JP encoded string to Shift_JIS.

    s starts in ASCII mode.  ESC $ B, ESC $ @ and ESC $ ( B switch to
    two-byte mode; ESC ( B and ESC ( J switch back to one-byte mode,
    whose bytes are copied through unchanged.  A two-byte run that is
    not closed by an escape sequence is converted up to the end of s.

    error selects what happens on undecodable two-byte data:
      "strict"  -- raise KanjiError
      "replace" -- emit the Shift_JIS geta mark (0x81 0xac)
      "ignore"  -- drop the offending bytes

    Raises KanjiError for an unknown error scheme, for any other
    escape sequence (whatever the error scheme) and, in strict mode,
    for truncated or malformed input.
    """
    if error not in ("strict", "replace", "ignore"):
        raise KanjiError("unknown error handling scheme")
    t = []
    j = len(s)
    kanji = 0
    start = 0
    while 1:
        end = s.find('\033', start)
        last = end < 0
        if last:
            # No further escape: the final segment runs to the end of s.
            end = j
        if kanji:
            for i in range(start, end, 2):
                # Compare against the segment end, not len(s), so an
                # odd-length run is reported as truncated instead of
                # pairing its last byte with the following ESC.
                if i + 1 >= end:
                    if error == "strict":
                        raise KanjiError("truncated string")
                    if error == "replace":
                        t.append("\x81\xac") # Shift_JIS
                    break
                c1, c2 = ord(s[i]), ord(s[i+1])
                if 0x21 <= c1 <= 0x7e and 0x21 <= c2 <= 0x7e:
                    # Go through the EUC-JP form (high bits set), then
                    # apply the same arithmetic as euc2sjis.  Floor
                    # division keeps the values ints on every Python.
                    c1 = c1 | 0x80
                    c2 = c2 | 0x80
                    if c1 & 1:
                        if c1 < 0xdf:
                            c1 = c1 // 2 + 0x31
                        else:
                            c1 = c1 // 2 + 0x71
                        if c2 < 0xe0:
                            c2 = c2 - 0x61
                        else:
                            c2 = c2 - 0x60
                    else:
                        if c1 < 0xdf:
                            c1 = c1 // 2 + 0x30
                        else:
                            c1 = c1 // 2 + 0x70
                        c2 = c2 - 2
                    t.append(chr(c1))
                    t.append(chr(c2))
                elif error == "strict":
                    raise KanjiError("malformed string")
                elif error == "replace":
                    t.append("\x81\xac") # Shift_JIS
        else:
            t.append(s[start:end])
        if last:
            break
        if s[end:end+4] == '\033$(B':
            start = end + 4
            kanji = 1
        elif s[end:end+3] in ('\033$B', '\033$@'):
            start = end + 3
            kanji = 1
        elif s[end:end+3] in ('\033(B', '\033(J'):
            start = end + 3
            kanji = 0
        else:
            raise KanjiError("unknown escape sequence at %d" % end)
    return ''.join(t)

# EUC-JP halfwidth katakana (0x8e + byte, optionally followed by a
# halfwidth dakuten 0x8e 0xde or handakuten 0x8e 0xdf) mapped to the
# equivalent two-byte fullwidth character.  Comments give the
# romanised name of the fullwidth result.
euc_han2zen_mapping = {
    "\x8e\xa1":         "\xa1\xa3", # ideographic full stop
    "\x8e\xa2":         "\xa1\xd6", # left corner bracket
    "\x8e\xa3":         "\xa1\xd7", # right corner bracket
    "\x8e\xa4":         "\xa1\xa2", # ideographic comma
    "\x8e\xa5":         "\xa1\xa6", # middle dot
    "\x8e\xa6":         "\xa5\xf2", # WO
    "\x8e\xa7":         "\xa5\xa1", # small A
    "\x8e\xa8":         "\xa5\xa3", # small I
    "\x8e\xa9":         "\xa5\xa5", # small U
    "\x8e\xaa":         "\xa5\xa7", # small E
    "\x8e\xab":         "\xa5\xa9", # small O
    "\x8e\xac":         "\xa5\xe3", # small YA
    "\x8e\xad":         "\xa5\xe5", # small YU
    "\x8e\xae":         "\xa5\xe7", # small YO
    "\x8e\xaf":         "\xa5\xc3", # small TU
    "\x8e\xb0":         "\xa1\xbc", # prolonged sound mark
    "\x8e\xb1":         "\xa5\xa2", # A
    "\x8e\xb2":         "\xa5\xa4", # I
    "\x8e\xb3":         "\xa5\xa6", # U
    "\x8e\xb3\x8e\xde": "\xa5\xf4", # VU
    "\x8e\xb4":         "\xa5\xa8", # E
    "\x8e\xb5":         "\xa5\xaa", # O
    "\x8e\xb6":         "\xa5\xab", # KA
    "\x8e\xb6\x8e\xde": "\xa5\xac", # GA
    "\x8e\xb7":         "\xa5\xad", # KI
    "\x8e\xb7\x8e\xde": "\xa5\xae", # GI
    "\x8e\xb8":         "\xa5\xaf", # KU
    "\x8e\xb8\x8e\xde": "\xa5\xb0", # GU
    "\x8e\xb9":         "\xa5\xb1", # KE
    "\x8e\xb9\x8e\xde": "\xa5\xb2", # GE
    "\x8e\xba":         "\xa5\xb3", # KO
    "\x8e\xba\x8e\xde": "\xa5\xb4", # GO
    "\x8e\xbb":         "\xa5\xb5", # SA
    "\x8e\xbb\x8e\xde": "\xa5\xb6", # ZA
    "\x8e\xbc":         "\xa5\xb7", # SI
    "\x8e\xbc\x8e\xde": "\xa5\xb8", # ZI
    "\x8e\xbd":         "\xa5\xb9", # SU
    "\x8e\xbd\x8e\xde": "\xa5\xba", # ZU
    "\x8e\xbe":         "\xa5\xbb", # SE
    "\x8e\xbe\x8e\xde": "\xa5\xbc", # ZE
    "\x8e\xbf":         "\xa5\xbd", # SO
    "\x8e\xbf\x8e\xde": "\xa5\xbe", # ZO
    "\x8e\xc0":         "\xa5\xbf", # TA
    "\x8e\xc0\x8e\xde": "\xa5\xc0", # DA
    "\x8e\xc1":         "\xa5\xc1", # TI
    "\x8e\xc1\x8e\xde": "\xa5\xc2", # DI
    "\x8e\xc2":         "\xa5\xc4", # TU
    "\x8e\xc2\x8e\xde": "\xa5\xc5", # DU
    "\x8e\xc3":         "\xa5\xc6", # TE
    "\x8e\xc3\x8e\xde": "\xa5\xc7", # DE
    "\x8e\xc4":         "\xa5\xc8", # TO
    "\x8e\xc4\x8e\xde": "\xa5\xc9", # DO
    "\x8e\xc5":         "\xa5\xca", # NA
    "\x8e\xc6":         "\xa5\xcb", # NI
    "\x8e\xc7":         "\xa5\xcc", # NU
    "\x8e\xc8":         "\xa5\xcd", # NE
    "\x8e\xc9":         "\xa5\xce", # NO
    "\x8e\xca":         "\xa5\xcf", # HA
    "\x8e\xca\x8e\xde": "\xa5\xd0", # BA
    "\x8e\xca\x8e\xdf": "\xa5\xd1", # PA
    "\x8e\xcb":         "\xa5\xd2", # HI
    "\x8e\xcb\x8e\xde": "\xa5\xd3", # BI
    "\x8e\xcb\x8e\xdf": "\xa5\xd4", # PI
    "\x8e\xcc":         "\xa5\xd5", # HU
    "\x8e\xcc\x8e\xde": "\xa5\xd6", # BU
    "\x8e\xcc\x8e\xdf": "\xa5\xd7", # PU
    "\x8e\xcd":         "\xa5\xd8", # HE
    "\x8e\xcd\x8e\xde": "\xa5\xd9", # BE
    "\x8e\xcd\x8e\xdf": "\xa5\xda", # PE
    "\x8e\xce":         "\xa5\xdb", # HO
    "\x8e\xce\x8e\xde": "\xa5\xdc", # BO
    "\x8e\xce\x8e\xdf": "\xa5\xdd", # PO
    "\x8e\xcf":         "\xa5\xde", # MA
    "\x8e\xd0":         "\xa5\xdf", # MI
    "\x8e\xd1":         "\xa5\xe0", # MU
    "\x8e\xd2":         "\xa5\xe1", # ME
    "\x8e\xd3":         "\xa5\xe2", # MO
    "\x8e\xd4":         "\xa5\xe4", # YA
    "\x8e\xd5":         "\xa5\xe6", # YU
    "\x8e\xd6":         "\xa5\xe8", # YO
    "\x8e\xd7":         "\xa5\xe9", # RA
    "\x8e\xd8":         "\xa5\xea", # RI
    "\x8e\xd9":         "\xa5\xeb", # RU
    "\x8e\xda":         "\xa5\xec", # RE
    "\x8e\xdb":         "\xa5\xed", # RO
    "\x8e\xdc":         "\xa5\xef", # WA
    "\x8e\xdd":         "\xa5\xf3", # N
    "\x8e\xde":         "\xa1\xab", # dakuten (voiced sound mark)
    "\x8e\xdf":         "\xa1\xac", # handakuten (semi-voiced sound mark)
    }

def euc_han2zen(s, error="strict"):
    """Replace halfwidth katakana in an EUC-JP string by fullwidth forms.

    A halfwidth kana followed by a halfwidth dakuten or handakuten is
    merged into one fullwidth character when euc_han2zen_mapping has an
    entry for the pair.  ASCII, two-byte characters and three-byte
    sequences starting with 0x8f are copied through unchanged.

    error selects what happens when 0x8e is followed by a byte that is
    not in the mapping:
      "strict"  -- raise KanjiError
      "replace" -- emit the EUC-JP geta mark (0xa2 0xae)
      "ignore"  -- drop the two bytes

    Raises KanjiError for an unknown error scheme and, in strict mode,
    for such malformed input.
    """
    if error not in ("strict", "replace", "ignore"):
        raise KanjiError("unknown error handling scheme")
    t = []
    i = 0
    j = len(s)
    while i < j:
        c = ord(s[i])
        if c == 0x8e:
            # Try kana + sound mark (4 bytes) first, then the kana alone.
            pair = euc_han2zen_mapping.get(s[i:i+4])
            single = euc_han2zen_mapping.get(s[i:i+2])
            if pair is not None:
                t.append(pair)
                i = i + 4
            elif single is not None:
                t.append(single)
                i = i + 2
            elif error == "strict":
                raise KanjiError("malformed string")
            else:
                if error == "replace":
                    t.append("\xa2\xae") # EUC-JP
                i = i + 2
        elif c < 0x80:
            t.append(s[i])
            i = i + 1
        elif c == 0x8f:
            # Three-byte sequence: copy it whole so that the scan stays
            # aligned on character boundaries (as euc_split does).
            t.append(s[i:i+3])
            i = i + 3
        else:
            t.append(s[i:i+2])
            i = i + 2
    return ''.join(t)

# Shift_JIS halfwidth katakana (one byte, optionally followed by a
# halfwidth dakuten 0xde or handakuten 0xdf) mapped to the equivalent
# two-byte fullwidth character.  Comments give the romanised name of
# the fullwidth result.
sjis_han2zen_mapping = {
    "\xa1":     "\x81\x42", # ideographic full stop
    "\xa2":     "\x81\x75", # left corner bracket
    "\xa3":     "\x81\x76", # right corner bracket
    "\xa4":     "\x81\x41", # ideographic comma
    "\xa5":     "\x81\x45", # middle dot
    "\xa6":     "\x83\x92", # WO
    "\xa7":     "\x83\x40", # small A
    "\xa8":     "\x83\x42", # small I
    "\xa9":     "\x83\x44", # small U
    "\xaa":     "\x83\x46", # small E
    "\xab":     "\x83\x48", # small O
    "\xac":     "\x83\x83", # small YA
    "\xad":     "\x83\x85", # small YU
    "\xae":     "\x83\x87", # small YO
    "\xaf":     "\x83\x62", # small TU
    "\xb0":     "\x81\x5b", # prolonged sound mark
    "\xb1":     "\x83\x41", # A
    "\xb2":     "\x83\x43", # I
    "\xb3":     "\x83\x45", # U
    "\xb3\xde": "\x83\x94", # VU
    "\xb4":     "\x83\x47", # E
    "\xb5":     "\x83\x49", # O
    "\xb6":     "\x83\x4a", # KA
    "\xb6\xde": "\x83\x4b", # GA
    "\xb7":     "\x83\x4c", # KI
    "\xb7\xde": "\x83\x4d", # GI
    "\xb8":     "\x83\x4e", # KU
    "\xb8\xde": "\x83\x4f", # GU
    "\xb9":     "\x83\x50", # KE
    "\xb9\xde": "\x83\x51", # GE
    "\xba":     "\x83\x52", # KO
    "\xba\xde": "\x83\x53", # GO
    "\xbb":     "\x83\x54", # SA
    "\xbb\xde": "\x83\x55", # ZA
    "\xbc":     "\x83\x56", # SI
    "\xbc\xde": "\x83\x57", # ZI
    "\xbd":     "\x83\x58", # SU
    "\xbd\xde": "\x83\x59", # ZU
    "\xbe":     "\x83\x5a", # SE
    "\xbe\xde": "\x83\x5b", # ZE
    "\xbf":     "\x83\x5c", # SO
    "\xbf\xde": "\x83\x5d", # ZO
    "\xc0":     "\x83\x5e", # TA
    "\xc0\xde": "\x83\x5f", # DA
    "\xc1":     "\x83\x60", # TI
    "\xc1\xde": "\x83\x61", # DI
    "\xc2":     "\x83\x63", # TU
    "\xc2\xde": "\x83\x64", # DU
    "\xc3":     "\x83\x65", # TE
    "\xc3\xde": "\x83\x66", # DE
    "\xc4":     "\x83\x67", # TO
    "\xc4\xde": "\x83\x68", # DO
    "\xc5":     "\x83\x69", # NA
    "\xc6":     "\x83\x6a", # NI
    "\xc7":     "\x83\x6b", # NU
    "\xc8":     "\x83\x6c", # NE
    "\xc9":     "\x83\x6d", # NO
    "\xca":     "\x83\x6e", # HA
    "\xca\xde": "\x83\x6f", # BA
    "\xca\xdf": "\x83\x70", # PA
    "\xcb":     "\x83\x71", # HI
    "\xcb\xde": "\x83\x72", # BI
    "\xcb\xdf": "\x83\x73", # PI
    "\xcc":     "\x83\x74", # HU
    "\xcc\xde": "\x83\x75", # BU
    "\xcc\xdf": "\x83\x76", # PU
    "\xcd":     "\x83\x77", # HE
    "\xcd\xde": "\x83\x78", # BE
    "\xcd\xdf": "\x83\x79", # PE
    "\xce":     "\x83\x7a", # HO
    "\xce\xde": "\x83\x7b", # BO
    "\xce\xdf": "\x83\x7c", # PO
    "\xcf":     "\x83\x7d", # MA
    "\xd0":     "\x83\x7e", # MI
    "\xd1":     "\x83\x80", # MU
    "\xd2":     "\x83\x81", # ME
    "\xd3":     "\x83\x82", # MO
    "\xd4":     "\x83\x84", # YA
    "\xd5":     "\x83\x86", # YU
    "\xd6":     "\x83\x88", # YO
    "\xd7":     "\x83\x89", # RA
    "\xd8":     "\x83\x8a", # RI
    "\xd9":     "\x83\x8b", # RU
    "\xda":     "\x83\x8c", # RE
    "\xdb":     "\x83\x8d", # RO
    "\xdc":     "\x83\x8f", # WA
    "\xdd":     "\x83\x93", # N
    "\xde":     "\x81\x4a", # dakuten (voiced sound mark)
    "\xdf":     "\x81\x4b", # handakuten (semi-voiced sound mark)
    }

def sjis_han2zen(s, error="strict"):
    """Replace halfwidth katakana in a Shift_JIS string by fullwidth forms.

    A halfwidth kana followed by a halfwidth dakuten or handakuten is
    merged into one fullwidth character when sjis_han2zen_mapping has
    an entry for the pair.  ASCII and two-byte characters are copied
    through unchanged.

    error is only checked for validity: every byte in 0xa1-0xdf has a
    mapping entry, so this conversion has no failure case of its own.
    Raises KanjiError for an unknown error scheme.
    """
    if error not in ("strict", "replace", "ignore"):
        raise KanjiError("unknown error handling scheme")
    t = []
    i = 0
    j = len(s)
    while i < j:
        c = ord(s[i])
        if 0xa1 <= c <= 0xdf:
            # Try kana + sound mark (2 bytes) first, then the kana alone.
            pair = sjis_han2zen_mapping.get(s[i:i+2])
            if pair is not None:
                t.append(pair)
                i = i + 2
            else:
                t.append(sjis_han2zen_mapping[s[i]])
                i = i + 1
        elif c < 0x80:
            t.append(s[i])
            i = i + 1
        else:
            t.append(s[i:i+2])
            i = i + 2
    return ''.join(t)

# Utility functions

def to_euc(s, error="strict"):
    """Convert s to EUC-JP, guessing its current encoding.

    s is returned unchanged when is_euc() accepts it, converted with
    sjis2euc() when is_sjis() accepts it, and otherwise passed to
    jis2euc().  error is forwarded to the chosen converter.

    Raises KanjiError for an unknown error scheme, plus whatever the
    chosen converter raises.
    """
    if error not in ("strict", "replace", "ignore"):
        raise KanjiError("unknown error handling scheme")
    if is_euc(s):
        return s
    elif is_sjis(s):
        conv = sjis2euc
    else:
        conv = jis2euc
    return conv(s, error)

def to_sjis(s, error="strict"):
    """Convert s to Shift_JIS, guessing its current encoding.

    s is returned unchanged when is_sjis() accepts it, converted with
    euc2sjis() when is_euc() accepts it, and otherwise passed to
    jis2sjis().  error is forwarded to the chosen converter.

    Raises KanjiError for an unknown error scheme, plus whatever the
    chosen converter raises.
    """
    if error not in ("strict", "replace", "ignore"):
        raise KanjiError("unknown error handling scheme")
    if is_sjis(s):
        return s
    elif is_euc(s):
        conv = euc2sjis
    else:
        conv = jis2sjis
    return conv(s, error)

def to_jis(s, error="strict"):
    """Convert s to ISO-2022-JP, guessing its current encoding.

    s is converted with euc2jis() when is_euc() accepts it, with
    sjis2jis() when is_sjis() accepts it, and is otherwise returned
    unchanged.  error is forwarded to the chosen converter.

    Raises KanjiError for an unknown error scheme, plus whatever the
    chosen converter raises.
    """
    if error not in ("strict", "replace", "ignore"):
        raise KanjiError("unknown error handling scheme")
    if is_euc(s):
        conv = euc2jis
    elif is_sjis(s):
        conv = sjis2jis
    else:
        return s
    return conv(s, error)

def is_euc(s): # return 1 if EUC-JP
    """Heuristically report whether s looks like EUC-JP (1) or not (0).

    ASCII bytes are skipped.  The first non-ASCII byte found at a
    character position decides: a value in 0xa1-0xdf or 0xf0-0xfe
    yields 1; any other value is stepped over together with the byte
    that follows it.
    """
    pos = 0
    length = len(s)
    while pos < length:
        code = ord(s[pos])
        if code < 0x80:
            pos = pos + 1
        elif 0xa1 <= code <= 0xdf or 0xf0 <= code <= 0xfe:
            return 1
        else:
            # Inconclusive lead byte: skip it and its trail byte.
            pos = pos + 2
    return 0

def is_sjis(s): # return 1 if Shift_JIS
    """Heuristically report whether s looks like Shift_JIS (1) or not (0).

    ASCII bytes are skipped.  A byte in 0x80-0x9f found at a character
    position yields 1; any other non-ASCII byte is stepped over
    together with the byte that follows it.
    """
    pos = 0
    length = len(s)
    while pos < length:
        code = ord(s[pos])
        if code < 0x80:
            pos = pos + 1
        elif code <= 0x9f:
            return 1
        else:
            # Inconclusive lead byte: skip it and its trail byte.
            pos = pos + 2
    return 0

def han2zen(s, error="strict"):
    """Convert halfwidth katakana in s to fullwidth, guessing the encoding.

    euc_han2zen() is used when is_euc() accepts s, sjis_han2zen() when
    is_sjis() accepts it; otherwise s is returned unchanged.  error is
    forwarded to the chosen converter.

    Raises KanjiError for an unknown error scheme, plus whatever the
    chosen converter raises.
    """
    if error not in ("strict", "replace", "ignore"):
        raise KanjiError("unknown error handling scheme")
    if is_euc(s):
        return euc_han2zen(s, error)
    elif is_sjis(s):
        return sjis_han2zen(s, error)
    else:
        return s

# Accepted spellings (lower case, hyphens and underscores removed)
# mapped to the canonical encoding names used by this module.
aliases = {
    "eucjp":     "EUC-JP",
    "xeucjp":    "EUC-JP",
    "ujis":      "EUC-JP",
    "shiftjis":  "Shift_JIS",
    "sjis":      "Shift_JIS",
    "xshiftjis": "Shift_JIS",
    "xsjis":     "Shift_JIS",
    "iso2022jp": "ISO-2022-JP",
    "jis7":      "ISO-2022-JP",
    }

def unify_encoding_name(enc):
    """Return the canonical name for the encoding name enc, or None.

    Hyphens and underscores are removed and the result is lower-cased
    before it is looked up in aliases, so "EUC-JP", "euc_jp" and
    "eucjp" all give "EUC-JP".  Names without an alias give None.
    """
    # str methods replace the string-module functions, which no longer
    # exist on current Python versions.
    key = enc.replace("-", "").replace("_", "").lower()
    return aliases.get(key)

# Converter for each (source, destination) pair of canonical encoding
# names.  Same-encoding pairs map to an identity function that takes
# the same (s, error="strict") arguments as the real converters, so a
# caller may pass error by keyword whichever entry it gets back.
_converters = {
    ("EUC-JP",      "EUC-JP"):      lambda s, error="strict": s,
    ("EUC-JP",      "Shift_JIS"):   euc2sjis,
    ("EUC-JP",      "ISO-2022-JP"): euc2jis,
    ("Shift_JIS",   "EUC-JP"):      sjis2euc,
    ("Shift_JIS",   "Shift_JIS"):   lambda s, error="strict": s,
    ("Shift_JIS",   "ISO-2022-JP"): sjis2jis,
    ("ISO-2022-JP", "EUC-JP"):      jis2euc,
    ("ISO-2022-JP", "Shift_JIS"):   jis2sjis,
    ("ISO-2022-JP", "ISO-2022-JP"): lambda s, error="strict": s,
    }

def get_converter(src_enc, dst_enc):
    """Return the converter from src_enc to dst_enc, or None.

    Both names are canonicalised with unify_encoding_name(); None is
    returned when _converters has no entry for the resulting pair.
    """
    pair = (unify_encoding_name(src_enc), unify_encoding_name(dst_enc))
    return _converters.get(pair)

# Halfwidth-to-fullwidth katakana converter for each canonical
# encoding name.  The ISO-2022-JP entry is an identity function that
# takes the same (s, error="strict") arguments as the other two, so a
# caller may pass error by keyword whichever entry it gets back.
_han2zen_converters = {
    "EUC-JP":      euc_han2zen,
    "Shift_JIS":   sjis_han2zen,
    "ISO-2022-JP": lambda s, error="strict": s,
    }

def get_han2zen_converter(enc):
    """Return the halfwidth-to-fullwidth converter for enc, or None.

    enc is canonicalised with unify_encoding_name(); None is returned
    when _han2zen_converters has no entry for the result.
    """
    canonical = unify_encoding_name(enc)
    return _han2zen_converters.get(canonical)

def euc_split(s):
    """Split an EUC-JP string into a list of per-character strings.

    A byte below 0x80 is one character, a sequence starting with 0x8f
    takes three bytes, and anything else takes two.  No validation is
    done; a character cut short by the end of s is returned as is.
    """
    pieces = []
    pos, length = 0, len(s)
    while pos < length:
        lead = s[pos]
        if lead < "\x80":
            width = 1
        elif lead == "\x8f":
            width = 3
        else:
            width = 2
        pieces.append(s[pos:pos + width])
        pos = pos + width
    return pieces

def sjis_split(s):
    """Split a Shift_JIS string into a list of per-character strings.

    A byte below 0x80 or in 0xa1-0xdf is one character; anything else
    takes two bytes.  No validation is done; a character cut short by
    the end of s is returned as is.
    """
    pieces = []
    pos, length = 0, len(s)
    while pos < length:
        lead = s[pos]
        if lead < "\x80" or "\xa1" <= lead <= "\xdf":
            width = 1
        else:
            width = 2
        pieces.append(s[pos:pos + width])
        pos = pos + width
    return pieces
