/* 
  mxTidy -- Interface to HTML Tidy (HTML/XML beautifier)

  Copyright (c) 2001-2002, eGenix.com Software GmbH; mailto:info@egenix.com
  See the documentation for further copyright information or contact
  the author (mailto:mal@lemburg.com).

*/

/* Debug defines: */
/*#define MAL_MEM_DEBUG*/
/*#define MAL_DEBUG*/
/*#define MAL_REF_DEBUG*/

/* Logging file used by debugging facility */
#ifndef MAL_DEBUG_OUTPUTFILE
# define MAL_DEBUG_OUTPUTFILE "mxTidy.log"
#endif

/* We want all our symbols to be exported */
#define MX_BUILDING_MXTIDY

#include "mx.h"
#include "mxTidy.h"

/* Include symbols from Tidy */
#include "htmltidy.h"

/* Version number: Major.Minor.Patchlevel */
#define VERSION "0.3.0"

/* --- module doc-string -------------------------------------------------- */

/* Doc-string handed to Py_InitModule4() in initmxTidy(). The text is
   assembled at compile time by string literal concatenation with the
   MXTIDY_MODULE and VERSION macros. */
static char *Module_docstring = 

 MXTIDY_MODULE" -- Interface to Tidy (HTML beautifier).\n\n"

 "Version "VERSION"\n\n"

 "Copyright (c) 2001-2002, eGenix.com Software GmbH; mailto:info@egenix.com\n\n"
 "                 All Rights Reserved\n\n"
 "See the documentation for further information on copyrights,\n"
 "or contact the author."
;

/* --- module globals ----------------------------------------------------- */

/* Module specific exception object. It is created by insexc() and
   stored in the module dictionary under the name "Error" during
   initmxTidy(); mxTidy_RunTidy() raises it when the parsed tree fails
   its integrity check. */
static PyObject *mxTidy_Error;		/* Error Exception object */

/* Flag telling us whether the module was initialized or not. It is
   set to 1 at the end of a successful initmxTidy() run and reset to 0
   by mxTidyModule_Cleanup(). */
static int mxTidy_Initialized = 0;

/* --- forward declarations ----------------------------------------------- */

/* --- internal macros ---------------------------------------------------- */

/* --- module helpers ----------------------------------------------------- */

/* Create an exception object, insert it into the module dictionary
   under the given name and return the object pointer.

   moddict -- module dictionary; its "__name__" entry is used to build
              the fully qualified exception name (MXTIDY_MODULE is
              used if the entry is missing or not a string)
   name    -- unqualified name of the exception, e.g. "Error"
   base    -- base object to be used by the exception object or NULL

   Returns a new reference to the exception object. NULL is returned
   with a Python exception set in case an error occurred: the
   qualified name does not fit the internal buffer, the exception
   could not be created or it could not be stored in moddict. */

static 
PyObject *insexc(PyObject *moddict,
		 char *name,
		 PyObject *base)
{
    PyObject *v;
    char fullname[256];
    char *modname;
    char *dot;
    
    v = PyDict_GetItemString(moddict, "__name__");
    if (v == NULL)
	modname = NULL;
    else
	modname = PyString_AsString(v);
    if (modname == NULL) {
	/* PyString_AsString() may have set an exception; fall back to
	   the static module name instead of failing */
	PyErr_Clear();
	modname = MXTIDY_MODULE;
    }

    /* The longest string built below is "<modname>.<name>" plus the
       terminating NUL byte (the trimmed variant is never longer), so
       checking this length once protects all writes to fullname. */
    if (strlen(modname) + 1 + strlen(name) + 1 > sizeof(fullname)) {
	PyErr_SetString(PyExc_SystemError,
			"exception name too long");
	return NULL;
    }

    /* The symbols from this extension are imported into
       mx.<packagename>. We trim the name to not confuse the user with
       an overly long package path: everything after the second dot of
       the module name is replaced by the exception name. */
    strcpy(fullname, modname);
    dot = strchr(fullname, '.');
    if (dot)
	dot = strchr(dot+1, '.');
    if (dot)
	strcpy(dot+1, name);
    else
	sprintf(fullname, "%s.%s", modname, name);

    v = PyErr_NewException(fullname, base, NULL);
    if (v == NULL)
	return NULL;
    if (PyDict_SetItemString(moddict,name,v)) {
	/* The dictionary did not take a reference, so drop ours to
	   avoid leaking the new exception object */
	Py_DECREF(v);
	return NULL;
    }
    return v;
}

/* NOTE: the helper below is compiled out via #if 0 and is not
   referenced by the enabled code in the visible part of this file. */
#if 0
/* Helper for adding integer constants. Check for errors with
   PyErr_Occurred() */
static 
void insint(PyObject *dict,
	    char *name,
	    int value)
{
    /* v is a temporary reference that is released again after the
       dictionary insert (Py_XDECREF tolerates a NULL v) */
    PyObject *v = PyInt_FromLong((long)value);
    PyDict_SetItemString(dict, name, v);
    Py_XDECREF(v);
}
#endif

/* Stub implementations for unary, binary and ternary operations.
   They are only compiled if Py_NEWSTYLENUMBER is defined and are not
   referenced anywhere in the visible part of this file.

   Each stub reports "operation not implemented" as a TypeError via
   the Py_Error macro and returns NULL. NOTE(review): Py_Error comes
   from the mx headers; it is expected to set the exception and branch
   to the onError label -- confirm against mx.h. */
#ifdef Py_NEWSTYLENUMBER
/* Stub for operations taking one operand */
static
PyObject *notimplemented1(PyObject *v)
{
    Py_Error(PyExc_TypeError,
	     "operation not implemented");
 onError:
    return NULL;
}

/* Stub for operations taking two operands */
static
PyObject *notimplemented2(PyObject *v, PyObject *w)
{
    Py_Error(PyExc_TypeError,
	     "operation not implemented");
 onError:
    return NULL;
}

/* Stub for operations taking three operands */
static
PyObject *notimplemented3(PyObject *u, PyObject *v, PyObject *w)
{
    Py_Error(PyExc_TypeError,
	     "operation not implemented");
 onError:
    return NULL;
}
#endif

/* --- Tidy configuration helpers -------------------------------------- */

/* Option parsing helpers.

   All three macros look up the key name in the options dictionary
   and, if it is present, convert the entry and store it in variable.
   They rely on the following names being available at the place of
   expansion:

     options -- PyObject * referring to the options dictionary
     value   -- PyObject * scratch variable (receives a borrowed
                reference to the dictionary entry)
     onError -- label that is branched to after an exception was set

   The macros are wrapped in do { ... } while (0) so that each use
   behaves like a single statement and must be terminated with a
   semicolon. */

/* Store truevalue or falsevalue in variable depending on the truth
   value of the option. PyObject_IsTrue() reports errors with a
   negative return value; these are propagated via onError rather than
   being treated as "true". */
#define SET_BOOL_OPTION(name, variable, truevalue, falsevalue)		\
    do {								\
	if ((value = PyDict_GetItemString(options, name)) != NULL) {	\
	    int mx_istrue = PyObject_IsTrue(value);			\
	    if (mx_istrue < 0)						\
		goto onError;						\
	    variable = mx_istrue ? truevalue : falsevalue;		\
	}								\
    } while (0)

/* Store the option as non-negative integer in variable. The sign
   check is done on the long returned by PyInt_AsLong() (which is -1
   in case of a conversion error) and not on variable itself, so that
   it also works in case variable has an unsigned type. The additional
   round-trip comparison rejects values which do not fit into
   variable. */
#define SET_INT_OPTION(name, variable) 					    \
    do {								    \
	if ((value = PyDict_GetItemString(options, name)) != NULL) {	    \
	    long mx_intvalue = PyInt_AsLong(value);			    \
	    variable = mx_intvalue;					    \
	    Py_Assert(mx_intvalue >= 0 &&				    \
		      (long)(variable) == mx_intvalue,			    \
		      PyExc_TypeError,					    \
		      "'"name"' option value must be an integer and >= 0"); \
	}								    \
    } while (0)

/* Store a copy of the string option (created with wstrdup()) in
   variable. NOTE(review): a previous value of variable is not freed
   here -- confirm that NewConfig() leaves these fields unset. */
#define SET_STRING_OPTION(name, variable)				\
    do {								\
	if ((value = PyDict_GetItemString(options, name)) != NULL) {	\
	    Py_Assert(PyString_Check(value),				\
		      PyExc_TypeError,					\
		      "'"name"' option value must be a string");	\
	    variable = wstrdup(PyString_AS_STRING(value));		\
	}								\
    } while (0)

/* Translate the textual 'indent' option into the two indentation
   flags of config.

   Accepted values (compared with wstrcasecmp()):

     "yes", "true"  -- IndentContent = yes, SmartIndent = no
     "no", "false"  -- IndentContent = no,  SmartIndent = no
     "auto"         -- IndentContent = yes, SmartIndent = yes

   Returns 0 on success. Any other value raises a ValueError and
   returns -1; config is left untouched in that case. */
static
int mxTidy_SetIndent(tidyconfig *config, char *value) 
{
    if (wstrcasecmp(value, "yes") == 0 ||
	wstrcasecmp(value, "true") == 0) {
	config->IndentContent = yes;
	config->SmartIndent = no;
	return 0;
    }

    if (wstrcasecmp(value, "no") == 0 ||
	wstrcasecmp(value, "false") == 0) {
	config->IndentContent = no;
	config->SmartIndent = no;
	return 0;
    }

    if (wstrcasecmp(value, "auto") == 0) {
	config->IndentContent = yes;
	config->SmartIndent = yes;
	return 0;
    }

    /* None of the known keywords matched */
    Py_Error(PyExc_ValueError,
	     "unknown 'indent' value");

 onError:
    return -1;
}

/* Translate the textual 'char_encoding' option into the CharEncoding
   setting of config.

   The names "ascii", "latin1", "raw", "utf8", "iso2022" and "mac" are
   recognized (compared with wstrcasecmp()).

   Returns 0 on success. Any other value raises a ValueError and
   returns -1; config is left untouched in that case. */
static
int mxTidy_SetEncoding(tidyconfig *config, char *value) 
{
    /* Mapping of option values to Tidy encoding constants; the order
       is the order in which the names are tried */
    static struct {
	char *name;
	int encoding;
    } encodings[] = {
	{"ascii",   ASCII},
	{"latin1",  LATIN1},
	{"raw",     RAW},
	{"utf8",    UTF8},
	{"iso2022", ISO2022},
	{"mac",     MACROMAN}
    };
    const int count = (int)(sizeof(encodings) / sizeof(encodings[0]));
    int i;

    for (i = 0; i < count; i++) {
	if (wstrcasecmp(value, encodings[i].name) == 0) {
	    config->CharEncoding = encodings[i].encoding;
	    return 0;
	}
    }

    /* No entry matched */
    Py_Error(PyExc_ValueError,
	     "unknown 'char-encoding' value");

 onError:
    return -1;
}

/* --- Tidy Interface ------------------------------------------------- */

/* Prepare the Tidy library for use by calling InitTidy().

   Always returns 0. initmxTidy() treats a non-zero return value as
   failure, so the int result leaves room for error reporting. */
static
int mxTidy_Initialize(void) 
{
    InitTidy();
    return 0;
}

/* Counterpart of mxTidy_Initialize(): calls DeInitTidy(), which is
   called to free hash tables etc.

   Always returns 0; mxTidyModule_Cleanup() checks the result for a
   non-zero failure indicator. */
static
int mxTidy_Cleanup(void) 
{
    DeInitTidy();
    return 0;
}

/* Run Tidy on inputstream and return the results as Python tuple.

   Parameters (the function does not change their reference counts):

     inputstream  -- Python file object or string with the markup to
                     process; must not be NULL
     outputstream -- Python file object receiving the markup or NULL
                     to have the markup collected in a memory buffer
     errorstream  -- Python file object receiving Tidy's messages or
                     NULL to have them collected in a memory buffer
     options      -- dictionary with Tidy options or NULL; an empty
                     dictionary is treated like NULL

   Returns the tuple built with the format "iiz#z#" from
   (totalerrors, totalwarnings, output data, error data). The data
   entries are taken from the data/datapos fields of the respective
   OutputStream; NULL is passed to Py_BuildValue() in case the stream
   was not created.

   In case of an error, an exception is set, all Tidy structures
   created so far are freed and NULL is returned.

   NOTE(review): Py_Assert and Py_Error are macros from the mx
   headers; they are expected to set the given exception and branch
   to the onError label -- confirm against mx.h. */
static
PyObject *mxTidy_RunTidy(PyObject *inputstream,
			 PyObject *outputstream,
			 PyObject *errorstream,
			 PyObject *options)
{
    /* NOTE(review): inputstreamname and doctype are assigned below,
       but only read by reporting code which is currently disabled
       with #if 0. */
    char *inputstreamname;
    tidyconfig *config = NULL;
    Node *document = NULL, *doctype;
    Lexer *lexer = NULL;
    InputStream *input = NULL;
    OutputStream *output = NULL;
    OutputStream *errout = NULL;
    PyObject *value, *result;
    int totalwarnings;
    int totalerrors;

    DPRINTF("starting tidy()...\n");

    /* Check Python arguments */
    if (options != NULL) {
	Py_Assert(PyDict_Check(options),
		  PyExc_TypeError,
		  "options must be given as dictionary");
	/* An empty dictionary allows skipping option parsing */
	if (PyDict_Size(options) == 0)
	    options = NULL;
    }

    /* Init Tidy globals */
    config = NewConfig();
    totalwarnings = 0;
    totalerrors = 0;

    DPRINTF("parsing options...\n");

    /* Parse Tidy options.

       These are the union of the command line options from tidy.c and
       the config file options from config.c.

       Note: All hyphens have to be converted to underscores to make
       the option names compatible to Python identifiers.

       Helpers:

	   RE: ^[^"]+\(".+"\), *{.*&\(.+\)}, +ParseBool},
	   ->  SET_BOOL_OPTION(\1, \2, yes);

    */

    if (options != NULL) {

	/* Note: "add_xml_decl" and "add_xml_pi" both set config->XmlPi */
	SET_BOOL_OPTION("add_xml_decl", config->XmlPi, yes, no);
	SET_BOOL_OPTION("add_xml_pi", config->XmlPi, yes, no);
	SET_BOOL_OPTION("add_xml_space", config->XmlSpace, yes, no);
	SET_BOOL_OPTION("assume_xml_procins", config->XmlPIs, yes, no);
	SET_BOOL_OPTION("break_before_br", config->BreakBeforeBR, yes, no);
	SET_BOOL_OPTION("clean", config->MakeClean, yes, no);
	SET_BOOL_OPTION("drop_empty_paras", config->DropEmptyParas, yes, no);
	SET_BOOL_OPTION("drop_font_tags", config->DropFontTags, yes, no);
	SET_BOOL_OPTION("enclose_block_text", config->EncloseBlockText, yes, no);
	SET_BOOL_OPTION("fix_backslash", config->FixBackslash, yes, no);
	SET_BOOL_OPTION("fix_bad_comments", config->FixComments, yes, no);
	SET_BOOL_OPTION("gnu_emacs", config->Emacs, yes, no);
	SET_BOOL_OPTION("hide_endtags", config->HideEndTags, yes, no);
	SET_BOOL_OPTION("indent_attributes", config->IndentAttributes, yes, no);
	SET_BOOL_OPTION("input_xml", config->XmlTags, yes, no);
	SET_BOOL_OPTION("literal_attributes", config->LiteralAttribs, yes, no);
	SET_BOOL_OPTION("logical_emphasis", config->LogicalEmphasis, yes, no);
	SET_BOOL_OPTION("numeric_entities", config->NumEntities, yes, no);
	SET_BOOL_OPTION("output_errors", config->Errors, yes, no);
	SET_BOOL_OPTION("output_markup", config->Output, yes, no);
	SET_BOOL_OPTION("output_xhtml", config->xHTML, yes, no);
	SET_BOOL_OPTION("output_xml", config->XmlOut, yes, no);
	SET_BOOL_OPTION("quiet", config->Quiet, yes, no);
	SET_BOOL_OPTION("quote_ampersand", config->QuoteAmpersand, yes, no);
	SET_BOOL_OPTION("quote_marks", config->QuoteMarks, yes, no);
	SET_BOOL_OPTION("quote_nbsp", config->QuoteNbsp, yes, no);
	SET_BOOL_OPTION("raw", config->RawOut, yes, no);
	SET_BOOL_OPTION("show_warnings", config->ShowWarnings, yes, no);
	SET_BOOL_OPTION("tidy_mark", config->TidyMark, yes, no);
	SET_BOOL_OPTION("uppercase_attributes", config->UpperCaseAttrs, yes, no);
	SET_BOOL_OPTION("uppercase_tags", config->UpperCaseTags, yes, no);
	SET_BOOL_OPTION("word_2000", config->Word2000, yes, no);
	SET_BOOL_OPTION("wrap_asp", config->WrapAsp, yes, no);
	SET_BOOL_OPTION("wrap_attributes", config->WrapAttVals, yes, no);
	SET_BOOL_OPTION("wrap_jste", config->WrapJste, yes, no);
	SET_BOOL_OPTION("wrap_php", config->WrapPhp, yes, no);
	SET_BOOL_OPTION("wrap_script_literals", config->WrapScriptlets, yes, no);
	SET_BOOL_OPTION("wrap_sections", config->WrapSection, yes, no);
	SET_INT_OPTION("indent_spaces", config->spaces);
	SET_INT_OPTION("tab_size", config->tabsize);
	SET_INT_OPTION("wrap", config->wraplen);
	SET_STRING_OPTION("alt_text", config->alt_text);

	/* Indentation */
	if ((value = PyDict_GetItemString(options, "indent")) != NULL) {
	    Py_Assert(PyString_Check(value),
		      PyExc_TypeError,
		      "'indent' option value must be a string");
	    if (mxTidy_SetIndent(config, PyString_AS_STRING(value)))
		goto onError;
	}

	/* Character encoding */
	if ((value = PyDict_GetItemString(options, "char_encoding")) != NULL) {
	    Py_Assert(PyString_Check(value),
		      PyExc_TypeError,
		      "'char_encoding' option value must be a string");
	    if (mxTidy_SetEncoding(config, PyString_AS_STRING(value)))
		goto onError;
	}

	/* Not yet supported... use a config file to set these */
#if 0
    {"new_inline_tags",     {(int *)&inline_tags},  ParseTagNames},
    {"new_blocklevel_tags", {(int *)&block_tags},   ParseTagNames},
    {"new_empty_tags",  {(int *)&empty_tags},       ParseTagNames},
    {"new_pre_tags",    {(int *)&pre_tags},         ParseTagNames},

    {"doctype",         {(int *)&doctype_str},      ParseDocType},
#endif

    }

    /* XXX Music:

       ETAGE 3 - hotel costes; mixed by Stephane Pompougnac

     */
    
    /* ensure config is self-consistent */
    AdjustConfig(config);

    /* Setup streams */

    /* Input: either a Python file object or a Python string */
    Py_Assert(inputstream != NULL,
	      PyExc_TypeError,
	      "missing inputstream");
    if (PyFile_Check(inputstream)) {
	input = InputStreamFromFile(PyFile_AsFile(inputstream),
				    config->CharEncoding,
				    config->tabsize);
	inputstreamname = PyString_AS_STRING(PyFile_Name(inputstream));
    }
    else if (PyString_Check(inputstream)) {
	input = InputStreamFromBuffer(PyString_AS_STRING(inputstream),
				      PyString_GET_SIZE(inputstream),
				      0,
				      config->CharEncoding,
				      config->tabsize);
	inputstreamname = "<string>";
    }
    else 
	Py_Error(PyExc_TypeError,
		 "inputstream must be a file object or string");

    /* Output: no stream at all if the output_markup option is off;
       otherwise a memory buffer (outputstream == NULL) or the given
       Python file object */
    if (config->Output == no)
	output = NULL;
    else if (outputstream == NULL)
	output = OutputStreamFromBuffer(NULL,
					0,
					0,
					config->CharEncoding);
    else if (PyFile_Check(outputstream))
	output = OutputStreamFromFile(PyFile_AsFile(outputstream),
				      config->CharEncoding);
    else	       
	Py_Error(PyExc_TypeError,
		 "outputstream must be a file object or None");

    /* Errors: same scheme as for the output stream, controlled by the
       output_errors option */
    if (config->Errors == no)
	errout = NULL;
    else if (errorstream == NULL)
	errout = OutputStreamFromBuffer(NULL,
					0,
					0,
					config->CharEncoding);
    else if (PyFile_Check(errorstream))
	errout = OutputStreamFromFile(PyFile_AsFile(errorstream),
				      config->CharEncoding);
    else Py_Error(PyExc_TypeError,
		  "errorstream must be a file object or None");

    DPRINTF("running lexer...\n");

    /* Initialize Lexer */
    lexer = NewLexer(config, input, errout);

    DPRINTF("running parser...\n");

    /* Tidy doesn't alter the doctype for generic XML docs */
    if (config->XmlTags)
	document = ParseXMLDocument(lexer);
    else {
	lexer->warnings = 0;

	document = ParseDocument(lexer);

	Py_Assert(CheckNodeIntegrity(document),
		  mxTidy_Error,
		  "Tidy tree has lost its integrity");

	/* Simplifies <b><b> ... </b> ...</b> etc. */
	NestedEmphasis(lexer, document);

	/* Cleans up <dir>indented text</dir> etc. */
	List2BQ(lexer, document);
	BQ2Div(lexer, document);

	/* Replaces <i> by <em> and <b> by <strong> */
	if (config->LogicalEmphasis)
	    EmFromI(lexer, document);

	/* Fix Word generated HTML files */
	if (config->Word2000 && IsWord2000(lexer, document)) {

	    /* prune Word2000's <![if ...]> ... <![endif]> */
	    DropSections(lexer, document);

	    /* drop style & class attributes and empty p, span elements */
	    CleanWord2000(lexer, document);
	}

	/* Replaces presentational markup by style rules */
	if (config->MakeClean || config->DropFontTags)
	    CleanTree(lexer, document);

	/* Check the tree again after the transformations above */
	Py_Assert(CheckNodeIntegrity(document),
		  mxTidy_Error,
		  "Tidy tree has lost its integrity");

	doctype = FindDocType(document);

	if (document->content) {
	    if (config->xHTML)
		SetXHTMLDocType(lexer, document);
	    else
		FixDocType(lexer, document);

	    if (config->TidyMark)
		AddGenerator(lexer, document);
	}

	/* Ensure presence of initial <?XML version="1.0"?> */
	if (config->XmlOut && config->XmlPi)
	    FixXMLPI(lexer, document);

	/* Note: the totals are only updated in this (non-XML) branch,
	   so they remain 0 for input parsed with ParseXMLDocument() */
	totalwarnings += lexer->warnings;
	totalerrors += lexer->errors;

	/* XXX Report errors & warnings... */
#if 0
	if (!config->Quiet && document->content) {
	    ReportVersion(errout, lexer, inputstreamname, doctype);
	    ReportNumWarnings(errout, lexer);
	}
#endif
    }

    /* XXX Report errors... */
    /* NOTE(review): errout is NULL here if the output_errors option
       is off -- confirm that NeedsAuthorIntervention() accepts NULL. */
    if (lexer->errors > 0)
	NeedsAuthorIntervention(errout);

    DPRINTF("generating output...\n");

    /* Generate output */
    /* Markup is only written if an output stream exists and the lexer
       did not report any errors */
    if (output != NULL && 
	config->Output && 
	lexer->errors == 0) {

	if (config->XmlTags)
	    PPrintXMLTree(lexer, output, 0, 0, document);
	else
	    PPrintTree(lexer, output, 0, 0, document);
    }

    /* XXX Report errors & warnings... */
#if 0
    ErrorSummary(lexer);
    if (totalerrors + totalwarnings > 0)
        GeneralInfo(errout);
#endif

    /* Build result tuple */
    /* result may be NULL in case Py_BuildValue() failed; the cleanup
       below runs in both cases and the NULL is passed on to the
       caller */
    result = Py_BuildValue("iiz#z#", 
			   totalerrors, totalwarnings,
			   output ? output->data : NULL, 
			   output ? output->datapos : 0,
			   errout ? errout->data : NULL, 
			   errout ? errout->datapos : 0);
    
    /* Free data structures */
    FreeNode(document);
    document = NULL;
    FreeLexer(lexer);
    lexer = NULL;
    FreeConfig(config);
    config = NULL;
    
    /* Free streams */
    FreeInputStream(input);
    input = NULL;
    if (output) {
	FreeOutputStream(output);
	output = NULL;
    }
    if (errout) {
	FreeOutputStream(errout);
	errout = NULL;
    }
    
    /* Return status */
    return result;

 onError:
    /* Free whatever was created before the error occurred; all
       pointers were initialized to NULL, so unset ones are skipped */
    if (document)
	FreeNode(document);
    if (lexer)
	FreeLexer(lexer);
    if (config)
	FreeConfig(config);
    if (input)
	FreeInputStream(input);
    if (output)
	FreeOutputStream(output);
    if (errout)
	FreeOutputStream(errout);
    return NULL;
}

/* --- Module Functions --------------------------------------------- */

/* Python level entry point tidy(input[, output, errors, options]).

   Parses the arguments with the format "O|OOO" and forwards them to
   mxTidy_RunTidy(). Optional arguments that were omitted or given as
   None are passed on as NULL. Returns the result of mxTidy_RunTidy()
   or NULL with an exception set if argument parsing failed. */
Py_C_Function(mxTidy_tidy,
	      "tidy(input, [output, errors=None, options={}])\n\n"
	      "Filter input through Tidy and write to output.\n"
	      "Tidy options must be given in the options dictionary."
	      )
{
    PyObject *instream;
    PyObject *outstream = NULL;
    PyObject *errstream = Py_None;
    PyObject *optdict = NULL;

    Py_Get4Args("O|OOO", instream, outstream, errstream, optdict);

    /* Map None to NULL for all optional arguments */
    return mxTidy_RunTidy(instream,
			  (outstream == Py_None) ? NULL : outstream,
			  (errstream == Py_None) ? NULL : errstream,
			  (optdict == Py_None) ? NULL : optdict);

 onError:
    return NULL;
}

/* XXX Add API to query Tidy version */

/* NOTE(review): the function below is compiled out via #if 0. It
   refers to names (mxTidyObject, _mxTidy_Check, mxTidy_FromString,
   mxTidy_FromJoiningTidys, RAW_Tidy) which are not defined in the
   visible part of this file and presumably is a leftover from the
   module this file was derived from -- confirm before re-enabling or
   removing it. */
#if 0
Py_C_Function( mxTidy_urljoin,
	       "urljoin(u,v)\n\n"
	       "Takes two Tidys or strings, joins them and returns the\n"
	       "result as Tidy object")
{
    mxTidyObject *a = 0,*b = 0;
    PyObject *u,*v;
    PyObject *url;

    Py_Get2Args("OO",u,v);

    /* Convert the first argument to an object owned by this function */
    if (_mxTidy_Check(u)) {
	a = (mxTidyObject *)u;
	Py_INCREF(u);
    }
    else if (PyString_Check(u)) {
	a = mxTidy_FromString(PyString_AS_STRING(u),
			     RAW_Tidy);
	if (!a)
	    goto onError;
    }
    else
	Py_Error(PyExc_TypeError,
		 "arguments must be Tidys or strings");

    /* Same for the second argument */
    if (_mxTidy_Check(v)) {
	b = (mxTidyObject *)v;
	Py_INCREF(v);
    }
    else if (PyString_Check(v)) {
	b = mxTidy_FromString(PyString_AS_STRING(v),
			     RAW_Tidy);
	if (!b)
	    goto onError;
    }
    else
	Py_Error(PyExc_TypeError,
		 "arguments must be Tidys or strings");

    url = (PyObject *)mxTidy_FromJoiningTidys((mxTidyObject*)a,
					    (mxTidyObject*)b);
    if (!url)
	goto onError;

    DPRINTF(" urljoin() returning '%s'\n",
	    PyString_AS_STRING(((mxTidyObject *)url)->url));
    Py_DECREF(a);
    Py_DECREF(b);
    Py_PRINT_REFCOUNT(url);
    return url;

 onError:
    /* a and b may still be unset here, hence the X variants */
    Py_XDECREF(a);
    Py_XDECREF(b);
    return NULL;
}

#endif

/* --- module init --------------------------------------------------------- */

/* Python Method Table

   Lists the functions exported by the module; it is passed to
   Py_InitModule4() in initmxTidy(). Only tidy() is enabled, the
   "setmimedict" entry is compiled out. */

static 
PyMethodDef Module_methods[] =
{   
    Py_MethodListEntry("tidy",mxTidy_tidy),
#if 0
    Py_MethodListEntrySingleArg("setmimedict",mxTidy_setmimedict),
#endif
    {NULL,NULL} /* end of list */
};

/* Exit handler of the module; registered with Py_AtExit() by
   initmxTidy().

   Shuts down Tidy via mxTidy_Cleanup(), clears the Python error state
   in case the cleanup reports a failure and marks the module as
   uninitialized again. */
static 
void mxTidyModule_Cleanup(void)
{
    int failed = mxTidy_Cleanup();

    if (failed)
	PyErr_Clear();

    /* Allow the module to be initialized again */
    mxTidy_Initialized = 0;
}

/* Module initialization function.

   Creates the module object with the methods from Module_methods,
   initializes Tidy, adds "__version__" and the "Error" exception to
   the module dictionary and registers mxTidyModule_Cleanup() as exit
   handler.

   The function has no return value; errors are detected with
   PyErr_Occurred() at the onError label and reported through
   Py_ReportModuleInitError(). Initializing the module more than once
   raises a SystemError. */
MX_EXPORT(void) 
     initmxTidy(void)
{
    PyObject *module, *moddict, *version;

    if (mxTidy_Initialized)
	Py_Error(PyExc_SystemError,
		 "can't initialize "MXTIDY_MODULE" more than once");

    /* Create module */
    module = Py_InitModule4(MXTIDY_MODULE, /* Module name */
			    Module_methods, /* Method list */
			    Module_docstring, /* Module doc-string */
			    (PyObject *)NULL, /* always pass this as *self */
			    PYTHON_API_VERSION); /* API Version */
    if (module == NULL)
	goto onError;

    /* Init Tidy */
    if (mxTidy_Initialize())
	goto onError;

    /* Add some constants to the module's dict */
    moddict = PyModule_GetDict(module);

    /* PyDict_SetItemString() does not steal the reference to the
       value, so the version string has to be released here in order
       not to leak it */
    version = PyString_FromString(VERSION);
    if (version == NULL)
	goto onError;
    if (PyDict_SetItemString(moddict, 
			     "__version__",
			     version)) {
	Py_DECREF(version);
	goto onError;
    }
    Py_DECREF(version);

    /* Errors */
    if (!(mxTidy_Error = insexc(moddict, "Error", PyExc_StandardError)))
	goto onError;

#if 0
    /* Type objects */
    Py_INCREF(&mxTidy_Type);
    PyDict_SetItemString(moddict, "TidyType",
			 (PyObject *)&mxTidy_Type);
#endif

    /* Register cleanup function */
    if (Py_AtExit(mxTidyModule_Cleanup)) {
	/* XXX what to do if we can't register that function ??? */
	DPRINTF("* Failed to register mxTidy cleanup function\n");
    }

    /* We are now initialized */
    mxTidy_Initialized = 1;

 onError:
    /* Check for errors and report them; this label is also reached
       on the success path, where no exception is pending */
    if (PyErr_Occurred())
	Py_ReportModuleInitError(MXTIDY_MODULE);
    return;
}
