Index: less/cmdbuf.c diff -u less/cmdbuf.c:1.44 less/cmdbuf.c:1.46 --- less/cmdbuf.c:1.44 Sun Sep 4 13:52:54 2005 +++ less/cmdbuf.c Sun Sep 4 23:36:22 2005 @@ -142,7 +142,7 @@ if (mp == NULL) { char *s = NULL; - mp = new_multi(); + mp = new_multibuf(); s = getenv("JLESSKEYCHARSET"); if (s == NULL) s = DEFKEYCHARSET; @@ -150,7 +150,7 @@ right_codeset_of_charset(s)); init_priority(mp); } - init_multi(mp); + init_multibuf(mp); #endif } @@ -383,11 +383,12 @@ #if ISO if (in_mca()) { - char *cbuf; - CHARSET *csbuf; + M_BUFDATA mbd; + char *p; int i, j; - multi_buffering(mp, c, NULL, &cbuf, &csbuf, &i, NULL); + multi_parse(mp, c, NULL_POSITION, &mbd); + i = mbd.byte; if (i > 0) for ((s = &cmdbuf[strlen_cs(cmdbuf, cmdcs)]), t = &cmdcs[strlen_cs(cmdbuf, cmdcs)]; @@ -398,15 +399,15 @@ } for (j = 0; j < i; j++) { - cp[j] = cbuf[j]; - csp[j] = csbuf[j]; + cp[j] = mbd.cbuf[j]; + csp[j] = mbd.csbuf[j]; } - cbuf = &cp[i]; + p = &cp[i]; /* * Reprint the tail of the line from the inserted char. */ cmd_repaint(cp); - while (cp < cbuf) + while (cp < p) cmd_right(); return (CC_OK); } Index: less/edit.c diff -u less/edit.c:1.27 less/edit.c:1.28 --- less/edit.c:1.27 Tue Aug 30 22:45:49 2005 +++ less/edit.c Sun Sep 4 22:25:54 2005 @@ -369,7 +369,7 @@ set_open(curr_ifile); /* File has been opened */ get_pos(curr_ifile, &initial_scrpos); #if ISO - init_multi(get_mulbuf(curr_ifile)); + init_multibuf(get_mulbuf(curr_ifile)); #endif new_file = TRUE; ch_init(f, chflags); Index: less/ifile.c diff -u less/ifile.c:1.22 less/ifile.c:1.23 --- less/ifile.c:1.22 Tue Aug 30 22:45:49 2005 +++ less/ifile.c Sun Sep 4 22:25:54 2005 @@ -133,7 +133,7 @@ p->h_hold = 0; p->h_filestate = NULL; #if ISO - p->h_mp = new_multi(); + p->h_mp = new_multibuf(); init_priority(p->h_mp); #endif link_ifile(p, prev); Index: less/input.c diff -u less/input.c:1.33 less/input.c:1.34 --- less/input.c:1.33 Sun Sep 4 19:52:57 2005 +++ less/input.c Sun Sep 4 23:36:22 2005 @@ -57,10 +57,8 @@ int blankline; int endline; #if ISO - 
char *cbuf; - CHARSET *csbuf; - int i; - POSITION pos; + MULBUF* mp = get_mulbuf(curr_ifile); + M_BUFDATA mbd; int ret; #endif @@ -90,7 +88,7 @@ prewind(); plinenum(curr_pos); #if ISO - multi_start_buffering(get_mulbuf(curr_ifile), curr_pos); + multi_start_buffering(mp, curr_pos); #endif (void) ch_seek(curr_pos); @@ -112,8 +110,8 @@ if (c == '\n' || c == EOI) { #if ISO - multi_buffering(get_mulbuf(curr_ifile), -1, NULL, &cbuf, &csbuf, &i, &pos); - ret = pappend_multi(cbuf, csbuf, i, pos); + multi_flush(mp, &mbd); + (void) pappend_multi(&mbd); #endif /* * End of the line. @@ -127,9 +125,8 @@ * Append the char to the line and get the next char. */ #if ISO - pos = ch_tell() - 1; - multi_buffering(get_mulbuf(curr_ifile), c, &pos, &cbuf, &csbuf, &i, &pos); - ret = pappend_multi(cbuf, csbuf, i, pos); + multi_parse(mp, c, ch_tell()-1, &mbd); + ret = pappend_multi(&mbd); #else ret = pappend(c, control_char(c) ? WRONGCS : ASCII, 1, ch_tell()-1); #endif @@ -146,11 +143,10 @@ c = ch_forw_get(); while (c != '\n' && c != EOI) { - multi_parsing(get_mulbuf(curr_ifile), - c); + multi_parse(mp, c, NULL_POSITION, NULL); c = ch_forw_get(); } - multi_parsing(get_mulbuf(curr_ifile), -1); + multi_discard(mp); #else do { @@ -163,8 +159,8 @@ } else { #if ISO - multi_parsing(get_mulbuf(curr_ifile), -1); - new_pos = pos; + multi_discard(mp); + new_pos = mbd.pos; #else new_pos = ch_tell() - 1; #endif @@ -212,10 +208,8 @@ int c; int endline; #if ISO - char *cbuf; - CHARSET *csbuf; - int i; - POSITION pos; + MULBUF* mp = get_mulbuf(curr_ifile); + M_BUFDATA mbd; int ret; #endif @@ -319,7 +313,7 @@ prewind(); plinenum(new_pos); #if ISO - multi_start_buffering(get_mulbuf(curr_ifile), new_pos); + multi_start_buffering(mp, new_pos); #endif (void) ch_seek(new_pos); @@ -335,23 +329,22 @@ if (c == '\n') { #if ISO - multi_buffering(get_mulbuf(curr_ifile), -1, NULL, &cbuf, &csbuf, &i, &pos); - ret = pappend_multi(cbuf, csbuf, i, pos); + multi_flush(mp, &mbd); + (void) pappend_multi(&mbd); #endif endline 
= TRUE; break; } #if ISO - pos = ch_tell() - 1; - multi_buffering(get_mulbuf(curr_ifile), c, &pos, &cbuf, &csbuf, &i, &pos); - ret = pappend_multi(cbuf, csbuf, i, pos); + multi_parse(mp, c, ch_tell()-1, &mbd); + ret = pappend_multi(&mbd); #else ret = pappend(c, control_char(c) ? WRONGCS : ASCII, 1, ch_tell()-1); #endif if (ret != 0) { #if ISO - multi_parsing(get_mulbuf(curr_ifile), -1); + multi_discard(mp); #endif /* * Got a full printable line, but we haven't @@ -366,9 +359,9 @@ } #if ISO pdone(0); - i = ch_tell() - pos; - new_pos -= i; - while (--i >= 0) + ret = ch_tell() - mbd.pos; + new_pos -= ret; + while (--ret >= 0) ch_back_get(); #else pdone(0); @@ -379,7 +372,7 @@ } } while (new_pos < curr_pos); #if ISO - multi_parsing(get_mulbuf(curr_ifile), -1); + multi_discard(mp); #endif pdone(endline); Index: less/less.nro diff -u less/less.nro:1.51 less/less.nro:1.53 --- less/less.nro:1.51 Sun Sep 4 19:13:38 2005 +++ less/less.nro Tue Sep 6 07:18:26 2005 @@ -1,4 +1,4 @@ -.TH LESS 1 "Version 382+iso258: 04 Sep 2005" +.TH LESS 1 "Version 382+iso259: 06 Sep 2005" .SH NAME less \- opposite of more .SH SYNOPSIS Index: less/lesskey.nro diff -u less/lesskey.nro:1.25 less/lesskey.nro:1.27 --- less/lesskey.nro:1.25 Sun Sep 4 19:13:38 2005 +++ less/lesskey.nro Tue Sep 6 07:18:27 2005 @@ -1,4 +1,4 @@ -.TH LESSKEY 1 "Version 382+iso258: 04 Sep 2005" +.TH LESSKEY 1 "Version 382+iso259: 06 Sep 2005" .SH NAME lesskey \- specify key bindings for less .SH SYNOPSIS Index: less/line.c diff -u less/line.c:1.92 less/line.c:1.95 --- less/line.c:1.92 Sun Sep 4 17:41:18 2005 +++ less/line.c Tue Sep 6 07:27:57 2005 @@ -619,12 +619,13 @@ * Returns 0 if ok, 1 if couldn't fit in buffer. 
*/ public int -pappend_multi(cbuf, csbuf, byte, pos) - char *cbuf; - CHARSET *csbuf; - int byte; - POSITION pos; +pappend_multi(mbd) + M_BUFDATA *mbd; { + char *cbuf = mbd->cbuf; + CHARSET *csbuf = mbd->csbuf; + int byte = mbd->byte; + POSITION pos = mbd->pos; int r; int saved_curr; int saved_column; @@ -634,6 +635,8 @@ int saved_cshift; int i; + if (byte == 0) + return (0); if (pendc) { if (do_append(pendc, control_char(pendc) ? WRONGCS : Index: less/multi.c diff -u less/multi.c:1.127 less/multi.c:1.135 --- less/multi.c:1.127 Sun Sep 4 19:07:19 2005 +++ less/multi.c Mon Sep 5 17:50:06 2005 @@ -52,8 +52,7 @@ #if ISO -static void reject_first_byte(); -static void rebuffering_multi(); +static void multi_reparse(); #if JAPANESE @@ -170,8 +169,25 @@ int icharset; /* Last non ASCII character set of input */ /* - * Buffers to keep all bytes of a multi-bytes character until it is - * proved to be right sequence. + * Small buffers to hold all parsing bytes of multi-byte characters. + * + * multi_parse() receives a sequence of bytes and buffers it. + * Each time multi_parse() recognizes a full data sequence representing + * one character, it converts the data into internal data and returns + * the converted data. + * + * The caller must buffer it somewhere and output it using outbuf() or + * outchar(). Those output functions convert internal data into an + * appropriate data stream for the chosen output device. + * + * As internal data, we use char[] and CHARSET[] to keep byte and + * additional information, respectively. We chose an ISO-2022 style + * data format as our internal data format because it is the easiest + * to work with. It has completely separate planes for each + * character set. This helps code conversion and other tasks a lot. + * For example, we don't need to work to separate Chinese and + * Japanese because they are separated from the beginning in ISO-2022 + * although UTF-8 uses only a single plane with all CJK character sets. 
*/ /* * Buffer for input/parsing @@ -179,18 +195,22 @@ m_position lastpos; /* position of last byte */ m_position startpos; /* position of first byte buffered */ unsigned char inbuf[20]; + m_position laststartpos; /* position of first byte buffered last time */ + int lastsg; /* last single-shifted plane (ms->sg) */ /* - * Second buffer. - * All recognized data is buffered with character set tag. + * Buffer for internalized/converted data */ - unsigned char multiint[10]; /* Buffer for recognized data */ - CHARSET multics[10]; /* Buffer for character set marks */ + unsigned char multiint[10]; /* Byte data */ + CHARSET multics[10]; /* Character set data (no UJIS/SJIS/UTF */ + /* because all of them are converted into */ + /* internal data format) */ int intindex; /* Index of multiint */ - m_position lastesqpos; /* Last escape sequence position */ - /* (point FIN char) */ }; #define INBUF(mp) ((mp)->inbuf[(mp)->lastpos%sizeof((mp)->inbuf)]) +#define INBUF0(mp) ((mp)->inbuf[(mp)->startpos%sizeof((mp)->inbuf)]) +#define INBUF1(mp) ((mp)->inbuf[((mp)->startpos+1)%sizeof((mp)->inbuf)]) +#define INBUF2(mp) ((mp)->inbuf[((mp)->startpos+2)%sizeof((mp)->inbuf)]) #define INBUFI(mp,i) ((mp)->inbuf[(i)%sizeof((mp)->inbuf)]) static int code_length(mp, cs) @@ -207,13 +227,13 @@ #if JAPANESE switch (CS2CHARSET(cs)) { case UJIS: - c = INBUFI(mp, mp->startpos); + c = INBUF0(mp); if (ISUJISKANJI1(c)) return 2; if (ISUJISKANA1(c)) return 2; if (ISUJISKANJISUP1(c)) return 3; return 1; case SJIS: - c = INBUFI(mp, mp->startpos); + c = INBUF0(mp); if (ISSJISKANJI1(c)) return 2; if (ISSJISKANA(c)) return 1; return 1; @@ -247,7 +267,7 @@ static void noconv1(mp) MULBUF *mp; { - mp->multiint[mp->intindex] = INBUFI(mp, mp->startpos); + mp->multiint[mp->intindex] = INBUF0(mp); mp->multics[mp->intindex] = ASCII; mp->intindex++; mp->startpos++; @@ -260,7 +280,7 @@ static void wrongcs1(mp) MULBUF *mp; { - mp->multiint[mp->intindex] = INBUFI(mp, mp->startpos); + mp->multiint[mp->intindex] = 
INBUF0(mp); mp->multics[mp->intindex] = WRONGCS; mp->intindex++; mp->startpos++; @@ -362,7 +382,7 @@ wrongcs1(mp); } else { wrongcs1(mp); - rebuffering_multi(); + multi_reparse(mp); } return; } else if ((c & 0x7f) == 0x20) { @@ -374,7 +394,7 @@ noconv1(mp); } else { wrongcs1(mp); - rebuffering_multi(); + multi_reparse(mp); } return; } @@ -415,7 +435,7 @@ */ wrongchar(mp); mp->startpos = pos; - rebuffering_multi(mp); + multi_reparse(mp); } } @@ -430,15 +450,15 @@ if (mp->lastpos - mp->startpos + 1 == 1) { /* do nothing */ } else if (mp->lastpos - mp->startpos + 1 == 2) { - if (ISUJISKANA(INBUFI(mp, mp->startpos), INBUF(mp))) { - mp->multiint[mp->intindex] = INBUF(mp) & 0x7f; + if (ISUJISKANA(INBUF0(mp), INBUF1(mp))) { + mp->multiint[mp->intindex] = INBUF1(mp) & 0x7f; mp->multics[mp->intindex] = mp->cs; mp->intindex += 1; mp->startpos = mp->lastpos + 1; - } else if (ISUJISKANJI(INBUFI(mp, mp->startpos), INBUF(mp))) { - mp->multiint[mp->intindex] = INBUFI(mp, mp->startpos); + } else if (ISUJISKANJI(INBUF0(mp), INBUF1(mp))) { + mp->multiint[mp->intindex] = INBUF0(mp); mp->multics[mp->intindex] = UJIS; - mp->multiint[mp->intindex + 1] = INBUF(mp); + mp->multiint[mp->intindex + 1] = INBUF1(mp); mp->multics[mp->intindex + 1] = REST_MASK | UJIS; /* @@ -447,9 +467,9 @@ if (chisvalid_cs(&mp->multiint[mp->intindex], &mp->multics[mp->intindex])) { /* JIS X 0208:1997 */ - mp->multiint[mp->intindex] = mp->multiint[0] & 0x7f; + mp->multiint[mp->intindex] &= 0x7f; mp->multics[mp->intindex] = mp->cs; - mp->multiint[mp->intindex + 1] = mp->multiint[1] & 0x7f; + mp->multiint[mp->intindex + 1] &= 0x7f; mp->multics[mp->intindex + 1] = REST_MASK | mp->cs; mp->intindex += 2; mp->startpos = mp->lastpos + 1; @@ -459,17 +479,16 @@ */ wrongchar(mp); mp->startpos = mp->lastpos + 1; - rebuffering_multi(mp); + multi_reparse(mp); } } } else if (mp->lastpos - mp->startpos + 1 == 3 && - ISUJISKANJISUP(INBUFI(mp, mp->startpos), - INBUFI(mp, mp->startpos + 1), INBUF(mp))) { - 
mp->multiint[mp->intindex] = INBUFI(mp, mp->startpos); + ISUJISKANJISUP(INBUF0(mp), INBUF1(mp), INBUF2(mp))) { + mp->multiint[mp->intindex] = INBUF0(mp); mp->multics[mp->intindex] = UJIS; - mp->multiint[mp->intindex + 1] = INBUFI(mp, mp->startpos + 1); + mp->multiint[mp->intindex + 1] = INBUF1(mp); mp->multics[mp->intindex + 1] = REST_MASK | UJIS; - mp->multiint[mp->intindex + 2] = INBUF(mp); + mp->multiint[mp->intindex + 2] = INBUF2(mp); mp->multics[mp->intindex + 2] = REST_MASK | UJIS; /* @@ -500,19 +519,19 @@ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #endif }; - c1 = mp->multiint[1] & 0x7f; + c1 = mp->multiint[mp->intindex + 1] & 0x7f; if (table[c1] != 0) { /* JIS X 0213:2000 plane 2 */ if (output == jis) { /* JIS cannot output JIS X 0213:2000 plane 2 */ wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); } else { mp->multiint[mp->intindex] = c1; mp->multics[mp->intindex] = JISX0213KANJI2; mp->multiint[mp->intindex + 1] = - mp->multiint[2] & 0x7f; + mp->multiint[mp->intindex + 2] & 0x7f; mp->multics[mp->intindex + 1] = REST_MASK | JISX0213KANJI2; mp->intindex += 2; @@ -523,12 +542,12 @@ if (output == sjis || output == jis) { /* SJIS cannot output JIS X 0212:1990 */ wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); } else { mp->multiint[mp->intindex] = c1; mp->multics[mp->intindex] = mp->cs; mp->multiint[mp->intindex + 1] = - mp->multiint[2] & 0x7f; + mp->multiint[mp->intindex + 2] & 0x7f; mp->multics[mp->intindex + 1] = REST_MASK | mp->cs; mp->intindex += 2; @@ -538,11 +557,11 @@ } else { wrongchar(mp); mp->startpos = mp->lastpos + 1; - rebuffering_multi(mp); + multi_reparse(mp); } } else { wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); } } @@ -562,10 +581,10 @@ mp->startpos = mp->lastpos + 1; } } else if (mp->lastpos - mp->startpos + 1 == 2 && - ISSJISKANJI(INBUFI(mp, mp->startpos), INBUF(mp))) { - mp->multiint[mp->intindex] = INBUFI(mp, mp->startpos); + ISSJISKANJI(INBUF0(mp), INBUF1(mp))) { + mp->multiint[mp->intindex] = 
INBUF0(mp); mp->multics[mp->intindex] = SJIS; - mp->multiint[mp->intindex + 1] = INBUF(mp); + mp->multiint[mp->intindex + 1] = INBUF1(mp); mp->multics[mp->intindex + 1] = REST_MASK | SJIS; /* @@ -595,7 +614,7 @@ #endif }; - c1 = table[INBUFI(mp, mp->startpos) & 0x7f]; + c1 = table[INBUF0(mp) & 0x7f]; c2 = INBUF(mp) - ((unsigned char)INBUF(mp) >= 0x80 ? 1 : 0); c3 = c2 >= 0x9e; if (c1 < 0x80) { @@ -614,7 +633,7 @@ if (output == jis) { /* JIS cannot output JIS X 0213:2000 plane 2 */ wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); } else { if (c1 > 0xA0) { /* row 3-4, 13-14, and 79-94 */ @@ -649,11 +668,11 @@ */ wrongchar(mp); mp->startpos = mp->lastpos + 1; - rebuffering_multi(mp); + multi_reparse(mp); } } else { wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); } } #endif @@ -711,7 +730,7 @@ mp->cs != JISX0208_90KANJI && mp->cs != JISX0213KANJI1) { wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); return; } /* UJIS cannot output regular ISO2022 except JIS */ @@ -725,7 +744,7 @@ mp->cs != JISX0213KANJI1 && mp->cs != JISX0213KANJI2) { wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); return; } /* SJIS cannot output JISX0212 or ISO2022 */ @@ -738,7 +757,7 @@ mp->cs != JISX0213KANJI1 && mp->cs != JISX0213KANJI2) { wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); return; } @@ -758,7 +777,7 @@ if (mp->io.right == japanese) { mp->sequence_counter++; if (mp->sequence_counter % 2 == 1 && - INBUFI(mp, mp->startpos) != 0xa4) /* ???? */ + INBUF0(mp) != 0xa4) /* ???? 
*/ { mp->sequence_counter = 0; } @@ -790,26 +809,26 @@ #endif if (c < 0x20) { wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); return; } else if (mp->cs != ASCII && (c <= 0x7f || (mp->io.right == iso8 && 0xa0 <= c && c <= 0xff))) { if (mp->cs != FINDCS(mp, c)) { wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); } else { internalize_iso(mp); } return; } else if (control_char(c)) { wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); return; } #if JAPANESE if (mp->lastpos - mp->startpos + 1 == 2) { - int c0 = INBUFI(mp, mp->startpos); + int c0 = INBUF0(mp); if (mp->priority == sjis && ISSJISKANJI(c0, c)) { #if UJIS0213 mp->cs = JISX0213KANJI1; @@ -875,8 +894,7 @@ } else if (mp->lastpos - mp->startpos + 1 == 3 && (mp->priority == ujis || mp->io.right == ujis || mp->io.right == japanese) && - ISUJISKANJISUP(INBUFI(mp, mp->startpos), - INBUFI(mp, mp->startpos + 1), c)) { + ISUJISKANJISUP(INBUF0(mp), INBUF1(mp), c)) { mp->cs = JISX0212KANJISUP; mp->priority = ujis; mp->icharset = UJIS; @@ -885,7 +903,7 @@ } #endif wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); } /* @@ -912,7 +930,6 @@ *plane = (mp->ms->irr ? IRR2CS(mp->ms->irr) : 0) | TYPE2CS(type) | FT2CS(c); mp->ms->irr = 0; mp->eseq = NOESC; - mp->lastesqpos = mp->lastpos; return (0); } } else if (0x30 <= c && c <= 0x7e) { @@ -922,7 +939,6 @@ *plane = (mp->ms->irr ? 
IRR2CS(mp->ms->irr) : 0) | TYPE2CS(type) | FT2CS(c); mp->ms->irr = 0; mp->eseq = NOESC; - mp->lastesqpos = mp->lastpos; return (0); } return (-1); @@ -935,7 +951,6 @@ if (0x40 <= c && c <= 0x7e) { mp->ms->irr = CODE2IRR(c); mp->eseq = NOESC; - mp->lastesqpos = mp->lastpos; return (0); } return (-1); @@ -1099,7 +1114,6 @@ assert(0); } if (mp->eseq == NOESC) { - mp->lastesqpos = mp->lastpos; fix_status_for_escape_sequence(mp); mp->startpos = mp->lastpos + 1; return (0); @@ -1110,9 +1124,8 @@ mp->eseq = NOESC; fix_status_for_escape_sequence(mp); } - mp->lastesqpos = mp->startpos; wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); return (0); wrongone: assert(mp->eseq == NOESC); @@ -1157,7 +1170,7 @@ break; } } - mp = new_multi(); + mp = new_multibuf(); init_priority(mp); while (*name) { if (*name == '\\' && @@ -1239,7 +1252,7 @@ #endif } -MULBUF *new_multi() +MULBUF *new_multibuf() { MULBUF *mp = (MULBUF*) ecalloc(1, sizeof(MULBUF)); mp->io.left = def_left; @@ -1248,15 +1261,17 @@ mp->rotation_io_right = 0; mp->eseq = NOESC; mp->ms = (struct m_status*) ecalloc(1, sizeof(struct m_status)); - init_multi(mp); + init_multibuf(mp); return (mp); } -void clear_multi(mp) +void clear_multibuf(mp) MULBUF *mp; { mp->lastpos = M_NULL_POS; mp->startpos = 0; + mp->laststartpos = 0; + mp->lastsg = WRONGPLANE; mp->intindex = 0; } @@ -1273,24 +1288,20 @@ ms->irr = 0; } -void init_multi(mp) +void init_multibuf(mp) MULBUF *mp; { -#if 0 - fprintf(stderr, "init_multi: %d, %d, %d, %d, %d, %d\n", mp->startpos, mp->lastpos, mp->lastesqpos); -#endif + mp->cs = ASCII; + init_ms(mp->ms); if (mp->eseq != NOESC) { mp->eseq = NOESC; - fix_status_for_escape_sequence(mp); } - mp->cs = ASCII; - init_ms(mp->ms); + fix_status_for_escape_sequence(mp); #if JAPANESE mp->sequence_counter = 0; #endif mp->icharset = ASCII; - clear_multi(mp); - mp->lastesqpos = M_NULL_POS; + clear_multibuf(mp); } /* @@ -1314,9 +1325,12 @@ * If a character was detected in internalize(), * clean sg since single shift 
affect only one character. */ - if (last_startpos != mp->startpos && mp->ms->sg != WRONGPLANE) { - mp->ms->sg = WRONGPLANE; - fix_status_for_escape_sequence(mp); + if (last_startpos != mp->startpos) { + mp->lastsg = mp->ms->sg; + if (mp->ms->sg != WRONGPLANE) { + mp->ms->sg = WRONGPLANE; + fix_status_for_escape_sequence(mp); + } } } @@ -1327,7 +1341,7 @@ * We firstly take out the first byte of buffered data before we call * this function. This routine parse all rest of buffered data again. */ -static void rebuffering_multi(mp) +static void multi_reparse(mp) MULBUF *mp; { m_position to; @@ -1389,19 +1403,22 @@ lpos = ch_tell(); - while (lpos < pos) { - c = ch_forw_get(); - assert(c != EOI && c != '\n'); - multi_parsing(mp, c); - lpos++; + if (lpos != pos) { + while (lpos < pos) { + c = ch_forw_get(); + assert(c != EOI && c != '\n'); + multi_parse(mp, c, NULL_POSITION, NULL); + lpos++; + } + ch_seek(pos); } - ch_seek(pos); } } #endif -#if 0 -int debug; +#define DEBUG 0 +#if DEBUG +int debug = 1; #endif /* @@ -1411,49 +1428,64 @@ MULBUF *mp; m_position pos; { -#if 0 - if (pos == 1562) - debug = 1; -#endif -/* fprintf(stderr, "%d, %d, %d, %d, %d, %d, %d, %d\n", pos, mp->lastpos, mp->startpos, mp->lastesqpos); */ - if (pos != mp->lastpos + 1) { -#if 0 - fprintf(stderr, "%d, %d, %d, %d, %d, %d, %d, %d\n", pos, mp->lastpos, mp->startpos, mp->lastesqpos); - fprintf(stderr, "oct %o, %o, %o, %o, %o, %o, %o, %o\n", pos, mp->lastpos, mp->startpos, mp->lastesqpos); -#endif - assert(mp->lastpos < mp->startpos); - if (pos <= mp->lastpos && pos > mp->lastesqpos) { - clear_multi(mp); - } else { - init_multi(mp); -#if LESS - multi_find_cs(mp, pos); - clear_multi(mp); -#endif + /* buffer must be empty */ + assert(mp->lastpos < mp->startpos); + + /* initialize m_status if it is necessary */ + if (pos == mp->lastpos + 2 || pos == mp->laststartpos) { + /* + * pos == mp->lastpos+2 if this line is started after \n. 
+ * pos == mp->laststartpos if this line is started by a non-fit + * character. + */ + /* restore backed up sg */ + if (mp->ms->sg != mp->lastsg) { + mp->ms->sg = mp->lastsg; + fix_status_for_escape_sequence(mp); } + /* adjust pointers */ + mp->startpos = pos; + mp->lastpos = pos - 1; } else { - /* Nothing to do */ + /* + * pos == somewhere else if this function is called after jump_loc(). + */ +#if DEBUG + if (debug) { + fprintf(stderr, "%qd, %qd, %qd, %qd\n", pos, mp->lastpos, + mp->startpos, mp->laststartpos); + fprintf(stderr, "oct %qo, %qo, %qo, %qo\n", pos, mp->lastpos, + mp->startpos, mp->laststartpos); + } +#endif + init_multibuf(mp); +#if LESS + multi_find_cs(mp, pos); + clear_multibuf(mp); +#endif + + /* adjust pointers */ + mp->startpos = pos; + mp->lastpos = pos - 1; + mp->laststartpos = pos; } } /* * Buffering characters untile get a guarantee that it is right sequence. */ -void multi_buffering(mp, c, pos, strbuf, csbuf, length, retpos) +void multi_parse(mp, c, pos, mbd) MULBUF* mp; int c; -m_position* pos; -unsigned char** strbuf; -CHARSET** csbuf; -unsigned int* length; -m_position* retpos; +m_position pos; +M_BUFDATA* mbd; { if (c < 0) { - if (retpos != NULL) { - *retpos = mp->startpos; + if (mbd != NULL) { + mbd->pos = mp->startpos; } /* - * Force to flush out buffered characters. + * Force to flush all buffering characters. 
*/ if (mp->eseq != NOESC) { mp->eseq = NOESC; @@ -1461,29 +1493,27 @@ } while (mp->startpos <= mp->lastpos) { wrongcs1(mp); - rebuffering_multi(mp); + multi_reparse(mp); } - *strbuf = mp->multiint; - *csbuf = mp->multics; - *length = mp->intindex; + if (mbd != NULL) { + mbd->cbuf = mp->multiint; + mbd->csbuf = mp->multics; + mbd->byte = mp->intindex; + } mp->intindex = 0; } else { - if (pos != NULL) { - if (*pos != mp->lastpos + 1) { - /* buffer must be empty */ - assert(mp->lastpos < mp->startpos); - /* start buffering */ - mp->startpos = *pos; - } - mp->lastpos = *pos; + if (pos != NULL_POSITION) { + assert(pos == mp->lastpos + 1); + mp->lastpos = pos; } else { mp->lastpos++; } INBUF(mp) = c; - if (retpos != NULL) { - *retpos = mp->startpos; + mp->laststartpos = mp->startpos; + if (mbd != NULL) { + mbd->pos = mp->startpos; } /* @@ -1491,25 +1521,32 @@ */ check_new_buffered_byte(mp); - *strbuf = mp->multiint; - *csbuf = mp->multics; - *length = mp->intindex; + if (mbd != NULL) { + mbd->cbuf = mp->multiint; + mbd->csbuf = mp->multics; + mbd->byte = mp->intindex; + } mp->intindex = 0; } } /* - * Parse and discard characters. This routine is used for chopping line. + * Flush buffered data. */ -void multi_parsing(mp, c) -MULBUF *mp; -int c; +void multi_flush(mp, mbd) +MULBUF* mp; +M_BUFDATA* mbd; { - unsigned char *strbuf; - CHARSET *csbuf; - unsigned int length; + multi_parse(mp, -1, NULL_POSITION, mbd); +} - multi_buffering(mp, c, NULL, &strbuf, &csbuf, &length, NULL); +/* + * Discard buffered data. 
+ */ +void multi_discard(mp) +MULBUF* mp; +{ + multi_parse(mp, -1, NULL_POSITION, NULL); } void set_codesets(mp, left, right) @@ -1517,8 +1554,8 @@ CODESET left; CODESET right; { - mp->io.left = left; - mp->io.right = right; + mp->io.left = left; + mp->io.right = right; } /* Index: less/multi.h diff -u less/multi.h:1.22 less/multi.h:1.25 --- less/multi.h:1.22 Sun Sep 4 13:52:55 2005 +++ less/multi.h Tue Sep 6 07:17:23 2005 @@ -26,6 +26,31 @@ /* + * The design of data structure of jless + * + * We use char[] byte data and CHARSET[] character set data to represent + * multilingual text. We defined CHARSET following ISO 2022 technique. + * All characters represented in ISO 2022 can be stored in less without + * any destructive conversion. + * + * For example, less can read text files using JIS C 6226-1978, JIS X + * 0208-1983, and JIS X 0208:1990 character sets and output everything + * using their original character set while searching a character encoded + * by JIS X 0213:2004. Inside of less, it buffers all text files using + * their original character set, unifies them when matching with the + * searching character, and outputs using their original character sets. + * + * If less needs conversions when it outputs internal data, it converts + * them on the fly. + * + * On the other hand, text using SJIS or UJIS is buffered after + * conversion while less is reading input stream. + * + * In addition, UTF-8 is buffered as UTF-8. Less converts it to appropriate + * character set/sets on the fly. (UTF-8 is not implemented yet). + */ + +/* * Definition of values to specify the character set. * And definitions some well known character sets and a types of set. 
*/ @@ -108,6 +133,36 @@ #define HEBREW (TYPE_96_CHARSET | FT2CS('H')) #define CYRILLIC (TYPE_96_CHARSET | FT2CS('L')) #define LATIN5 (TYPE_96_CHARSET | FT2CS('M')) +/* + * JISX0208_78KANJI means JIS C 6226-1978 (called JIS X 0208-1978) + * JISX0208KANJI means JIS X 0208-1983 (same as JIS C 6226-1983) + * This is similar to JIS C 6226-1978. Several characters are moved + * or exchanged in code space. Conversion table is available in unify.c. + * JISX0208_90KANJI means JIS X 0208:1990 (same as JIS X 0208-1990) + * This is super set of JIS X 0208-1983. Two characters are added from + * JIS X 0208-1983. In addition, this covers JIS X 0208:1997 too. + * They have the same code space. The difference between them is + * historical description. JIS X 0208:1997 defines and describes + * all characters. + * JISX0213KANJI1 means JIS X 0213:2000 plane 1 + * This is super set of JIS X 0208:1990 and JIS X 0208:1997. Several + * characters are added. + * JISX02132004KANJI1 means JIS X 0213:2004 plane 1 + * This is super set of JIS X 0213:2000. 10 characters are added. + * And, glyph of several characters is modified. + * + * JISX0212KANJISUP means JIS X 0212:1990 (same as JIS X 0212-1990) + * JISX0213KANJI2 means JIS X 0213:2000 plane 2 + * JISX02132004KANJI2 means JIS X 0213:2004 plane 2 + * + * JISX0201KANA means JIS X 0201:1976 right plane (same as JIS X 0201-1976 + * and JIS C 6220-1976 right plane) + * JISX0201ROMAN means JIS X 0201:1976 left plane (same as JIS X 0201-1976 + * and JIS C 6220-1976 left plane) + * These cover JIS X 0201:1997 too. They have the same code space. + * The difference between them is historical description. + * JIS X 0201:1997 defines and describes all characters. 
+ */ #define JISX0208_78KANJI (TYPE_94N_CHARSET | FT2CS('@')) #define GB2312 (TYPE_94N_CHARSET | FT2CS('A')) #define JISX0208KANJI (TYPE_94N_CHARSET | FT2CS('B')) @@ -116,14 +171,29 @@ #define JISX0212KANJISUP (TYPE_94N_CHARSET | FT2CS('D')) #define JISX0213KANJI1 (TYPE_94N_CHARSET | FT2CS('O')) #define JISX0213KANJI2 (TYPE_94N_CHARSET | FT2CS('P')) +#define JISX02132004KANJI1 (TYPE_94N_CHARSET | FT2CS('Q')) +#define JISX02132004KANJI2 (TYPE_94N_CHARSET | FT2CS('P')) #if JAPANESE /* * Special number for Japanese code set. Only input_set use following with - * above definitions. The 07/15 is not valid for F. Thus I use it to - * indicate the special character sets. + * above definitions. The 07/15 or 07/14 are not valid for F. So, we are + * using them as indications of special character sets. + * + * SJIS contains ASCII, JIS X 0201:1976 right plane, and JIS X 0208:1997 + * UJIS contains ASCII, JIS X 0201:1976, and JIS X 0208:1997 + * SJIS2000 contains ASCII, JIS X 0201:1976 right plane, and JIS X 0213:2000 + * UJIS2000 contains ASCII, JIS X 0201:1976, JIS X 0213:2000, + * and JIS X 0212:1990 + * SJIS2004 contains ASCII, JIS X 0201:1976 right plane, and JIS X 0213:2004 + * UJIS2004 contains ASCII, JIS X 0201:1976, JIS X 0213:2004, + * and JIS X 0212:1990 */ #define SJIS (IRR2CS(1) | TYPE_94N_CHARSET | FT_MASK) -#define UJIS (IRR2CS(2) | TYPE_94N_CHARSET | FT_MASK) +#define SJIS2000 (IRR2CS(2) | TYPE_94N_CHARSET | FT_MASK) +#define SJIS2004 (IRR2CS(3) | TYPE_94N_CHARSET | FT_MASK) +#define UJIS (IRR2CS(1) | TYPE_94N_CHARSET | (FT_MASK-1)) +#define UJIS2000 (IRR2CS(2) | TYPE_94N_CHARSET | (FT_MASK-1)) +#define UJIS2004 (IRR2CS(3) | TYPE_94N_CHARSET | (FT_MASK-1)) #endif #endif @@ -169,18 +239,32 @@ jis, /* A subset of ISO 2022 */ /* * It may contain JIS C 6226-1978, JIS X 0208-1983, - * JIS X 0208:1990/1997, JIS X 0212:1990, JIS X 0213:2000, - * JIS X 0201:1976/1997 left/right planes, and ASCII as input. 
+ * JIS X 0208:1990/1997, JIS X 0212:1990, + * JIS X 0213:2000/2004, JIS X 0201:1976/1997 left/right + * planes, and ASCII. * - * In the case of output, this means all JIS C 6226-1978, - * JIS X 0208-1983, JIS X 0208:1990/1997, and JIS X 0213:2000 - * are converted into JIS X 0208-1983 encode with an assumption - * that character set of JIS X 0208-1983 encode is - * JIS X 0213:2000. And JIS X 0212:1990 and 2nd plane of - * JIS X 0213:2000 are rejected when output. + * If less is specified to use "jis" as its encoding scheme + * for input stream, less accepts all above character sets. + * e.g. jis-ujis or jis-sjis in JLESSCHARSET. * - * If you need the same code as the output, please use iso7 - * or iso8. + * If less is specified to use "jis" as its encoding scheme + * for output stream, less outputs all characters in + * JIS C 6226-1978 as JIS X 0208-1983 with conversion + * and all other characters in JIS X 0208:1990/1997, + * and JIS X 0213:2000/2004 plane 1 using JIS X 0208-1983 + * (ESC$B) encoding scheme without any conversion. + * Less doesn't convert here with a hope that an output + * device may use JIS X 0213:2004 plane 1 character set + * as its glyph. + * e.g. iso7-jis or ujis-jis in JLESSCHARSET. + * + * In addition, less rejects JIS X 0212:1990 and JIS X + * 0213:2000 plane 2 if "jis" is specified as its encoding + * scheme for output stream. + * e.g. jis or ujis-jis in JLESSCHARSET. + * + * If you need to use JIS X 0213:2004 or any other character + * sets as the output, please use iso7 or iso8. */ iso7, /* A code set which is extented by iso2022 */ /* code sets for only right plane */ @@ -192,6 +276,15 @@ iso8 /* A code set which is extented by iso2022 */ } CODESET; +/* + * A structure used as a return value in multi_parse(). + */ +typedef struct { + char *cbuf; + CHARSET *csbuf; + int byte; + POSITION pos; +} M_BUFDATA; /* * struct multibuf is internal data structure for multi.c. 
@@ -209,12 +302,13 @@ extern void init_priority (); extern CODESET get_priority (); extern void set_priority (); -extern MULBUF * new_multi (); -extern void clear_multi (); -extern void init_multi (); -extern void multi_start_buffering (); -extern void multi_buffering (); -extern void multi_parsing (); +extern MULBUF * new_multibuf (); +extern void clear_multibuf (); +extern void init_multibuf (); +extern void multi_start (); +extern void multi_parse (); +extern void multi_flush (); +extern void multi_discard (); extern void set_codesets (); extern char * get_icharset_string (); extern char * outchar(); Index: less/search.c diff -u less/search.c:1.62 less/search.c:1.63 --- less/search.c:1.62 Sun Sep 4 15:48:03 2005 +++ less/search.c Sun Sep 4 23:36:22 2005 @@ -272,6 +272,10 @@ char cbuffer[10]; CHARSET csbuffer[10]; int donef = 0; +#if ISO + M_BUFDATA mbd; + MULBUF* mp = get_mulbuf(curr_ifile); +#endif #if ISO if (!(ops & CVT_TO_INT) && srccs == NULL) @@ -283,7 +287,7 @@ return; } - multi_start_buffering(get_mulbuf(curr_ifile), pos); + multi_start_buffering(mp, pos); while ((srccs != NULL && (*src != NULCH || !CSISNULLCS(*srccs))) || (srccs == NULL && !donef)) { @@ -295,17 +299,19 @@ if (*src == '\0') { /* flush buffer */ - multi_buffering(get_mulbuf(curr_ifile), - -1, NULL, &cbuf, &csbuf, - &bufcount, NULL); + multi_flush(mp, &mbd); + cbuf = mbd.cbuf; + csbuf = mbd.csbuf; + bufcount = mbd.byte; donef = 1; } else { /* make charset */ - multi_buffering(get_mulbuf(curr_ifile), - (unsigned char) *src, - &pos, &cbuf, &csbuf, - &bufcount, NULL); + multi_parse(mp, (unsigned char) *src, + pos, &mbd); + cbuf = mbd.cbuf; + csbuf = mbd.csbuf; + bufcount = mbd.byte; } if (bufcount == 0) { Index: less/unify.c diff -u less/unify.c:1.30 less/unify.c:1.31 --- less/unify.c:1.30 Sun Sep 4 17:51:59 2005 +++ less/unify.c Sun Sep 4 23:36:22 2005 @@ -30,7 +30,7 @@ */ #include "defines.h" -#include "multi.h" +#include "less.h" #if ISO Index: less/version.c diff -u less/version.c:1.104 
less/version.c:1.107 --- less/version.c:1.104 Sun Sep 4 19:13:38 2005 +++ less/version.c Tue Sep 6 07:18:27 2005 @@ -745,9 +745,12 @@ Fixed JIS X 0213:2000 related problems. Thanks to Takeshi WATANABE. Also, fixed a problem reported by him. Less will not split one wrong multi-byte character into different lines - even it is not fit in first line. Less moves entire text to - represent the character into second line. + even if it is not fit in first line. Less moves entire text + to second line. iso258 9/4/05 Joined with less-382. +iso259 9/6/05 Changed an algorithm to detect the gap of parsing input stream. + This fixed a problem on long JIS/English text. + Fixed '\r' problem. */ -char version[] = "382+iso258"; +char version[] = "382+iso259";