/*******************************************************************************
 *
 * htmltokenizer.c - HTML parsing engine
 *
 * Hash functions for identifying html tags.
 *
 * Cheetah Web Browser
 * Copyright (C) 2001 Garett Spencley 
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
 *
 *******************************************************************************/

#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "htmltokenizer.h"
#include "html.h"
#include "str_util.h"

#define TOTAL_KEYWORDS 92
#define MIN_WORD_LENGTH 1
#define MAX_WORD_LENGTH 10
#define MIN_HASH_VALUE 1
#define MAX_HASH_VALUE 231

/*
 * tag_hash()
 *
 * Computes the hash key of an HTML tag name for use with the lookup
 * table in lookup_tag().
 *
 * str - tag name; only str[0] and str[len - 1] are examined
 * len - number of characters in str; the caller must pass len >= 1,
 *       because str[len - 1] is read unconditionally
 *
 * Returns len plus the table weights of the first and last characters.
 */

__inline static unsigned int tag_hash(const char *str, unsigned int len)
{
	/*
	 * Per-byte weights, indexed by character code.  Only '1'..'6' and
	 * some upper-case letters carry a real weight; every other byte
	 * maps to 232, which pushes the key past MAX_HASH_VALUE so that
	 * lookup_tag() rejects it.
	 */
	static const unsigned char asso_values[] = {
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		/* 40..49: last entry is '1' */
		232, 232, 232, 232, 232, 232, 232, 232, 232,  45,
		/* 50..59: starts at '2' */
		 40,  50,  20,  25,  30, 232, 232, 232, 232, 232,
		/* 60..69: 'A' is entry 65 */
		232, 232, 232, 232, 232,  90,  20,  75,  65,  47,
		/* 70..79: 'F'..'O' */
		 35,  10, 125, 115, 232,  20,   5,   5, 125,   7,
		/* 80..89: 'P'..'Y' */
		100,   0,  35,  40,   0,  55,  10, 232,  20,  35,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
		232, 232, 232, 232, 232, 232
	};

	/* Go through unsigned char so bytes >= 0x80 index the table safely. */
	const unsigned char first = (unsigned char) str[0];
	const unsigned char last = (unsigned char) str[len - 1];
	unsigned int key = len;

	key += asso_values[last];
	key += asso_values[first];

	return key;
}

/*
 * lookup_tag()
 * 
 * Returns the structure for the tag we are looking for
 *
 * str - candidate tag name.  It is compared against the upper-case
 *       keywords in wordlist[] using a first-character test followed by
 *       my_strcmp() (from str_util.h; presumably strcmp-like and
 *       therefore expecting a NUL-terminated string - confirm).
 * len - length of str.  Lengths outside
 *       MIN_WORD_LENGTH..MAX_WORD_LENGTH never match.
 *
 * Returns a pointer to the matching entry of the function's static
 * wordlist[] table, or NULL when str is not a known tag.
 */

__inline html_token *lookup_tag(const char *str, unsigned int len)
{
	/* DON'T TOUCH THIS! Use gperf on hash/tags.gperf */
		
	/*
	 * Keyword table, ordered by hash key.  The first member of each
	 * entry is the tag name (read below as .tag); the other two look
	 * like a tag id and a handler - see html_token's declaration to
	 * confirm.
	 */
	static html_token wordlist[] = {
	  {"Q", QUOTE, html_q},
      {"TT", TT, html_tt},
      {"TFOOT", TFOOT, html_tfoot},
      {"OBJECT", OBJECT, html_object},
      {"OL", OL, html_ol},
      {"LABEL", LABEL, html_label},
      {"BASEFONT", BASEFONT, html_basefont},
      {"LINK", LINK, html_link},
      {"BDO", BDO, html_bdo},
      {"BIG", BIG, html_big},
      {"TR", TR, html_tr},
      {"FONT", FONT, html_font},
      {"TBODY", TBODY, html_tbody},
      {"B", BOLD, html_bold},
      {"FIELDSET", FIELDSET, html_fieldset},
      {"FRAMESET", FRAMESET, html_frameset},
      {"FORM", FORM, html_form},
      {"SCRIPT", SCRIPT, html_script},
      {"SELECT", SELECT, html_select},
      {"VAR", VAR, html_var},
      {"SMALL", SMALL, html_small},
      {"TITLE", TITLE, html_title},
      {"TABLE", TABLE, html_table},
      {"EM", EM, html_em},
      {"STRONG", STRONG, html_strong},
      {"BR", BR, html_br},
      {"BODY", BODY, html_body},
      {"UL", UL, html_ul},
      {"SUB", SUB, html_sub},
      {"MENU", MENU, html_menu},
      {"TD", TD, html_td},
      {"DT", DT, html_dt},
      {"THEAD", THEAD, html_thead},
      {"BASE", BASE, html_base},
      {"DL", DL, html_dl},
      {"DEL", STRIKE, html_strike},
      {"LEGEND", LEGEND, html_legend},
      {"BLOCKQUOTE", BLOCKQUOTE, html_blockquote},
      {"DIV", DIV, html_div},
      {"S", STRIKE, html_strike},
      {"COL", COL, html_col},
      {"FRAME", FRAME, html_frame},
      {"KBD", KBD, html_kbd},
      {"STYLE", STYLE, html_style},
	  {"STRIKE", STRIKE, html_strike},
      {"APPLET", APPLET, html_applet},
      {"TEXTAREA", TEXTAREA, html_textarea},
      {"META", META, html_meta},
      {"ACRONYM", ACRONYM, html_acronym},
      {"DIR", DIRECTORY, html_dir},
      {"MAP", MAP, html_map},
      {"PARAM", PARAM, html_param},
      {"U", UNDERLINE, html_underline},
      {"OPTGROUP", OPTGROUP, html_optgroup},
      {"CENTER", CENTER, html_center},
      {"INPUT", INPUT, html_input},
      {"LI", LI, html_li},
      {"CODE", CODE, html_code},
      {"CITE", CITE, html_cite},
      {"TH", TH, html_th},
      {"IMG", IMG, html_img},
      {"ABBR", ABBR, html_abbr},
      {"DD", DD, html_dd},
      {"NOSCRIPT", NOSCRIPT, html_noscript},
      {"HTML", HTML, html_html},
      {"ADDRESS", ADDRESS, html_address},
      {"OPTION", OPTION, html_option},
      {"ISINDEX", ISINDEX, html_isindex},
      {"SUP", SUP, html_sup},
      {"SAMP", SAMP, html_samp},
      {"H4", H4, html_header},
      {"PRE", PRE, html_pre},
      {"BUTTON", BUTTON, html_button},
      {"H5", H5, html_header},
      {"H6", H6, html_header},
      {"INS", INS, html_ins},
      {"HR", HR, html_hr},
      {"NOBR", NOBR, html_nobr},
      {"H2", H2, html_header},
      {"IFRAME", IFRAME, html_iframe},
      {"SPAN", SPAN, html_span},
      {"H1", H1, html_header},
      {"NOFRAMES", NOFRAMES, html_noframes},
      {"H3", H3, html_header},
      {"A", A, html_a},
      {"COLGROUP", COLGROUP, html_colgroup},
      {"AREA", AREA, html_area},
      {"DFN", DFN, html_dfn},
      {"HEAD", HEAD, html_head},
      {"P", PARA, html_para},
      {"CAPTION", CAPTION, html_caption},
      {"I", ITALIC, html_italic}
	};

	/*
	 * Maps a hash key (0..MAX_HASH_VALUE) to wordlist[]:
	 *
	 *   value >= 0               index of the only keyword with this key
	 *   value < -TOTAL_KEYWORDS  several keywords share this key; the
	 *                            value encodes the position of a
	 *                            (start, count) pair stored elsewhere in
	 *                            this same array (decoded below)
	 *   any other negative value no keyword has this key when it is
	 *                            looked up directly (this includes the
	 *                            slots that hold the (start, count) pairs)
	 */
	static short lookup[] = {
		-1,    0,    1,   -1,   -1,    2,   -1,   -1,
        -1,   -1,   -1,   -1,   -1,    3,    4,    5,
        -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
        -1,   -1,   -1,   -1,    6,    7,    8,   -1,
        -1,    9,   -1,   -1,   -1,   10,   -1,   11,
        12,   13,   -1, -167,   16,   -1, -158,   -1,
        19,   -1,   20,   -1, -153,   -1,   23,   -1,
        24,   25,   -1,   26,  -71,   -2,   27,   28,
        29,  -75,   -2, -161,  -62,   -2,   32,   33,
        34,   35,  -78,   -2,   36,   37,   38,   -1,
        -1,   39,   -1,   40,   -1,   -1,   -1,   41,
        42,   -1,   -1,   -1,   43,   44,   -1,   -1,
        45,   -1,   46,   47,   -1,   -1,   48,   49,
        -1,   -1,   -1,   -1,   50,   -1,   51,   52,
        -1,   -1,   -1,   53,   54,   -1,   -1,   -1,
        55,   -1,   56,   -1,   -1,   -1, -223,   59,
        60,   61,  -35,   -2,   62,   63,   64,   -1,
        -1,   65,   66,   -1,   -1,   -1,   67,   68,
        69,   -1,   -1,   70,   -1,   -1,   71,   72,
        73,   -1,   -1,   -1,   -1,   74,   75,   -1,
        -1,   -1,   76,   -1,   77,   -1,   -1,   78,
        79,   80,   -1,   -1,   81,   82,   -1,   -1,
        -1,   83,   -1,   -1,   -1,   84,   -1,   85,
        86,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
        -1,   87,   88,   -1,   -1,   -1,   -1,   -1,
        -1,   89,   -1,   -1,   -1,   -1,   -1,   90,
        -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
        -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
        -1,   -1,   -1,   -1,   -1,   -1,   -1,   91
	};

	/* The length check also guarantees len >= 1, which tag_hash() needs. */
	if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) {

		register int key = tag_hash(str, len);

		/* Keys above MAX_HASH_VALUE would index past the end of lookup[]. */
		if (key <= MAX_HASH_VALUE && key >= 0) {

			register int index = lookup[key];

			if (index >= 0) {

				/* Unique key: confirm that the candidate really is this keyword. */
				register const char *s = wordlist[index].tag;

				if ((*str == *s) && (my_strcmp(str + 1, s + 1) == 0))
					return &wordlist[index];

			} else if (index < -TOTAL_KEYWORDS) {

				/*
				 * Shared key: lookup[offset] holds the index of the first
				 * colliding keyword minus TOTAL_KEYWORDS, and
				 * lookup[offset + 1] holds the negated number of
				 * colliding keywords.  Scan that run of wordlist[].
				 */
				register int offset = -1 - TOTAL_KEYWORDS - index;
				register html_token *wordptr = &wordlist[TOTAL_KEYWORDS + lookup[offset]];
				register html_token *wordendptr = wordptr + -lookup[offset + 1];

				while (wordptr < wordendptr) {

					register const char *s = wordptr->tag;

					if ((*str == *s) && (my_strcmp(str + 1, s + 1) == 0))
						return wordptr;

					wordptr++;
				}
			}
			/* Any other negative index: no keyword for this key. */
		}
	}

	return NULL;
}

/*
 * get_tag()
 *
 * Looks up the html_token entry for the tag name 'tag', ignoring case.
 *
 * tag - NUL-terminated tag name (e.g. "br" or "Table"); may be NULL.
 *       The string is not modified.
 *
 * Returns the entry found by lookup_tag() for the upper-cased name, or
 * NULL if 'tag' is NULL, empty, too long to be a tag, or unknown.
 */

html_token *get_tag(char *tag)
{
	/*
	 * Size of the scratch buffer for the upper-cased name.  It must stay
	 * larger than MAX_WORD_LENGTH so that no valid tag is rejected by
	 * the length check below; names that do not fit cannot match any
	 * keyword, so rejecting them early changes no result.  A fixed
	 * buffer also keeps stack use bounded for arbitrarily long input.
	 */
	enum { TAG_BUF_SIZE = 64 };

	char buf[TAG_BUF_SIZE];
	size_t len;
	size_t i;

	/* Check for NULL before anything dereferences 'tag'. */
	if (!tag)
		return NULL;

	len = strlen(tag);
	if (len == 0 || len >= sizeof buf)
		return NULL;

	/*
	 * toupper() is only defined for unsigned char values and EOF, so
	 * convert first to stay well-defined for bytes >= 0x80.
	 */
	for (i = 0; i < len; i++)
		buf[i] = (char) toupper((unsigned char) tag[i]);
	buf[len] = '\0';

	return lookup_tag(buf, (unsigned int) len);
}
