# Hye-Shik Chang <16 Feb 2002>
# $Id: unijohab.py,v 1.2 2002/03/16 02:35:20 perky Exp $

import codecs

from korean.hangul import Jaeum, Moeum, ishangul, split, join
# Module-level encode/decode map placeholders.  They start out empty and are
# not populated anywhere in the visible code.
encmap = {}
decmap = {}

# Initial consonants: conjoining jamo code point (U+1100..U+1112) mapped to
# the matching Jaeum constant.  U+115F (HANGUL CHOSEONG FILLER) maps to the
# empty string.
johab2uni_chosung = {
    u'\u115f': u'',
    u'\u1100': Jaeum.G,
    u'\u1101': Jaeum.GG,
    u'\u1102': Jaeum.N,
    u'\u1103': Jaeum.D,
    u'\u1104': Jaeum.DD,
    u'\u1105': Jaeum.R,
    u'\u1106': Jaeum.M,
    u'\u1107': Jaeum.B,
    u'\u1108': Jaeum.BB,
    u'\u1109': Jaeum.S,
    u'\u110a': Jaeum.SS,
    u'\u110b': Jaeum.A,
    u'\u110c': Jaeum.J,
    u'\u110d': Jaeum.JJ,
    u'\u110e': Jaeum.CH,
    u'\u110f': Jaeum.K,
    u'\u1110': Jaeum.T,
    u'\u1111': Jaeum.P,
    u'\u1112': Jaeum.H,
}
# Medial vowels: conjoining jamo code point (U+1161..U+1175) mapped to the
# matching Moeum constant.  U+1160 (HANGUL JUNGSEONG FILLER) maps to the
# empty string.
johab2uni_jungsung = {
    u'\u1160': u'',
    u'\u1161': Moeum.A,
    u'\u1162': Moeum.AE,
    u'\u1163': Moeum.YA,
    u'\u1164': Moeum.YAE,
    u'\u1165': Moeum.EO,
    u'\u1166': Moeum.E,
    u'\u1167': Moeum.YEO,
    u'\u1168': Moeum.YE,
    u'\u1169': Moeum.O,
    u'\u116a': Moeum.WA,
    u'\u116b': Moeum.WAE,
    u'\u116c': Moeum.WOE,
    u'\u116d': Moeum.YO,
    u'\u116e': Moeum.OO,
    u'\u116f': Moeum.WO,
    u'\u1170': Moeum.WE,
    u'\u1171': Moeum.WI,
    u'\u1172': Moeum.YU,
    u'\u1173': Moeum.EU,
    u'\u1174': Moeum.EUI,
    u'\u1175': Moeum.I,
}
# Final consonants: conjoining jamo code point (U+11A8..U+11C2) mapped to the
# matching Jaeum constant.  The empty string maps to itself, i.e. "no final
# consonant" has no code point of its own in this table.
johab2uni_jongsung = {
    u'': u'',
    u'\u11a8': Jaeum.G,
    u'\u11a9': Jaeum.GG,
    u'\u11aa': Jaeum.GS,
    u'\u11ab': Jaeum.N,
    u'\u11ac': Jaeum.NJ,
    u'\u11ad': Jaeum.NH,
    u'\u11ae': Jaeum.D,
    u'\u11af': Jaeum.R,
    u'\u11b0': Jaeum.RG,
    u'\u11b1': Jaeum.RM,
    u'\u11b2': Jaeum.RB,
    u'\u11b3': Jaeum.RS,
    u'\u11b4': Jaeum.RT,
    u'\u11b5': Jaeum.RP,
    u'\u11b6': Jaeum.RH,
    u'\u11b7': Jaeum.M,
    u'\u11b8': Jaeum.B,
    u'\u11b9': Jaeum.BS,
    u'\u11ba': Jaeum.S,
    u'\u11bb': Jaeum.SS,
    u'\u11bc': Jaeum.A,
    u'\u11bd': Jaeum.J,
    u'\u11be': Jaeum.CH,
    u'\u11bf': Jaeum.K,
    u'\u11c0': Jaeum.T,
    u'\u11c1': Jaeum.P,
    u'\u11c2': Jaeum.H,
}

uni2johab_chosung = {}
uni2johab_jungsung = {}
uni2johab_jongsung = {}
for k, v in johab2uni_chosung.items():
    uni2johab_chosung[v] = k
for k, v in johab2uni_jungsung.items():
    uni2johab_jungsung[v] = k
for k, v in johab2uni_jongsung.items():
    uni2johab_jongsung[v] = k


class Codec(codecs.Codec):
    """Codec between unicode text and UTF-8 encoded conjoining-jamo text.

    ``encode`` replaces every character accepted by ``ishangul`` with the
    concatenation of its three ``uni2johab_*`` table entries and serialises
    the result as UTF-8.  ``decode`` reads UTF-8 and recombines runs of
    characters in the U+1100..U+11FF range through the ``johab2uni_*``
    tables and ``join``.
    """

    # Unicode to character buffer
    def encode(self, data, errors='strict',
               supported_errors=('strict', 'ignore', 'replace')):
        """Encode the unicode string *data*.

        Returns a ``(encoded, consumed)`` tuple where *encoded* is the
        UTF-8 byte string and *consumed* is ``len(data)``.

        Raises UnicodeError when *errors* is not one of *supported_errors*.
        """
        if errors not in supported_errors:
            raise UnicodeError("unknown error handling")

        pieces = []
        for c in data:
            if ishangul(c):
                # split() yields the (initial, medial, final) components;
                # each is translated through its own inverse table.
                cho, jung, jong = split(c)
                pieces.append(
                    uni2johab_chosung[cho] +
                    uni2johab_jungsung[jung] +
                    uni2johab_jongsung[jong]
                )
            else:
                # Everything else passes through unchanged.
                pieces.append(c)

        return (u''.join(pieces).encode('utf-8', errors), len(data))

    # character buffer to Unicode
    def decode(self, data, errors='strict',
               supported_errors=('strict', 'ignore', 'replace')):
        """Decode the UTF-8 byte string *data*.

        Returns a ``(text, consumed)`` tuple where *text* is the decoded
        unicode string and *consumed* is the length of the *input*.

        A character in U+1100..U+11FF that does not start a valid
        initial + medial sequence is handled according to *errors*:
        ``'strict'`` raises UnicodeError, ``'replace'`` emits U+FFFD and
        ``'ignore'`` drops it.

        Raises UnicodeError when *errors* is not one of *supported_errors*.
        """
        if errors not in supported_errors:
            raise UnicodeError("unknown error handling")

        # Measure the input before it is decoded: the codec API reports how
        # much of the *input* was consumed, not how many characters came out.
        consumed = len(data)
        text = unicode(data, 'utf-8', errors)
        size = len(text)

        pieces = []
        p = 0
        while p < size:
            if not u'\u1100' <= text[p] <= u'\u11FF':
                pieces.append(text[p])
                p += 1
                continue

            c = text[p:p+3]
            try:
                cho = johab2uni_chosung[c[0]]
                # IndexError here means the input ended after the initial.
                jung = johab2uni_jungsung[c[1]]
            except (KeyError, IndexError):
                if errors == 'replace':
                    pieces.append(u'\uFFFD') # REPLACEMENT CHARACTER
                elif errors == 'strict':
                    raise UnicodeError(
                        "unexpected byte \\u%04x found" % ord(c[0]))
                # Skip just the offending character and resynchronise.
                p += 1
                continue

            # The final consonant is optional; consume it only when the
            # third character really is one.
            if len(c) > 2 and c[2] in johab2uni_jongsung:
                jong = johab2uni_jongsung[c[2]]
                p += 3
            else:
                jong = u''
                p += 2
            pieces.append(join([cho, jung, jong]))

        return (u''.join(pieces), consumed)


class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer combining Codec with codecs.StreamWriter; adds nothing."""

class StreamReader(Codec, codecs.StreamReader):
    """Stream reader combining Codec with codecs.StreamReader; adds nothing."""
    # XXX: placeholder for now -- no reader-specific behaviour is defined.

### encodings module API

def getregentry():
    """Return the (encoder, decoder, StreamReader, StreamWriter) tuple.

    The encoder and decoder are bound methods of two separate Codec
    instances.
    """
    encoder = Codec().encode
    decoder = Codec().decode
    return (encoder, decoder, StreamReader, StreamWriter)
