#include "htmlise.h"
/* Geometry of a plain-text table, as worked out from the first paragraph of
 * the table. */
struct table_layout {
    /* Index, within the first paragraph, of the line holding the table rule. */
    size_t ruleline;
    /* Indent recorded for the table rule. */
    size_t ruleindent;
    /* Length of the table rule line; also the number of entries in is_gap. */
    size_t width;
    /* Number of columns in the table; also the number of entries in col and
     * in ncols. */
    size_t ntablecolumns;

    /* One flag per character position (width of them). A flag is true when
     * that position must hold whitespace on every line of the table. */
    char *is_gap;

    /* Starting character position of each table column. */
    size_t *col;
    /* Width, in characters, of each table column. */
    size_t *ncols;
};
/* paragraph_is_start_of_table PARAGRAPH LAYOUT
 * If PARAGRAPH is a possible start-of-table, fill in LAYOUT with the layout of
 * the table and return 1. Otherwise return 0 and leave LAYOUT untouched. The
 * table rule is the first line (other than line 0) which consists only of
 * '-' and ' ', contains at least one ' ', and has each of its spaces in a
 * character column that is blank on every line of the paragraph. On
 * successful return LAYOUT->is_gap, col and ncols are allocated on the heap
 * and must be freed by the caller. If memory cannot be allocated the
 * paragraph is reported as not being a table (0 is returned). */
static int paragraph_is_start_of_table(const struct paragraph *P, struct table_layout *L) {
    /* minindent starts at the largest size_t so any real indent is smaller. */
    size_t i, j, minindent = (size_t)-1, maxlen = 0, *candidates, ncandidates = 0;
    char *is_gap = NULL;
    int ret = 0;

    candidates = malloc(P->nlines * sizeof *candidates);
    if (!candidates)
        return 0;   /* Out of memory, or no lines at all: not a table. */

    /* Look at each line in the paragraph and find the minimum indent and
     * maximum length of the lines. Also identify any candidate table rules. */
    for (i = 0; i < P->nlines; ++i) {
        const char *line = P->lines[i];
        size_t indent, len, ngaps = 0;

        indent = strspn(line, " ");
        len = strlen(line);

        /* Find horizontal extent of paragraph. */
        if (indent < minindent)
            minindent = indent;
        if (len > maxlen) {
            /* Grow via a temporary so the old block is not lost on failure. */
            char *grown = realloc(is_gap, len);
            if (!grown)
                goto done;
            is_gap = grown;
            /* Positions past the end of every earlier line were implicitly
             * blank there, so this line alone decides their initial state. */
            for (j = maxlen; j < len; ++j)
                is_gap[j] = line[j] == ' ' ? 1 : 0;
            maxlen = len;
        }

        /* Find gaps: any non-space character rules its position out. */
        for (j = 0; j < len; ++j) {
            if (line[j] != ' ')
                is_gap[j] = 0;
            else
                ++ngaps;
        }

        /* Is this line a candidate table rule? It can't be the first line of
         * the paragraph, since a header has to sit above it. */
        if (i > 0 && ngaps > 0 && strspn(line, "- ") == len)
            candidates[ncandidates++] = i;
    }

    /* Now find the first candidate table rule which satisfies the
     * constraints: every space in the rule must lie in a gap column. */
    for (i = 0; i < ncandidates; ++i) {
        const char *rule = P->lines[candidates[i]];
        size_t len = strlen(rule), n, ncolumns = 0, *col, *ncols;
        int in_column = 0;
        struct table_layout Lz = { 0 };

        for (j = 0; j < len; ++j)
            if (rule[j] == ' ' && !is_gap[j])
                break;
        if (j < len)
            continue;

        /* Choose this one. Count the columns: each run of non-gap positions
         * is one column. */
        for (j = 0; j < maxlen; ++j) {
            if (is_gap[j])
                in_column = 0;
            else if (!in_column) {
                in_column = 1;
                ++ncolumns;
            }
        }

        col = malloc(ncolumns * sizeof *col);
        ncols = malloc(ncolumns * sizeof *ncols);
        if (ncolumns > 0 && (!col || !ncols)) {
            free(col);
            free(ncols);
            goto done;
        }

        /* Record where each column starts and how wide it is. */
        for (j = 0, n = 0; j < maxlen; ++j) {
            if (!is_gap[j] && (j == 0 || is_gap[j - 1]))
                col[n] = j;
            if (is_gap[j] && j > 0 && !is_gap[j - 1]) {
                ncols[n] = j - col[n];
                ++n;
            }
        }
        /* A final column running up to the end of the line is not closed by
         * a gap. It occupies positions col[n] .. maxlen - 1. */
        if (n < ncolumns) {
            ncols[n] = maxlen - col[n];
            ++n;
        }
        assert(n == ncolumns);

        /* Only now hand everything over to the caller, so LAYOUT is never
         * left half filled in. */
        *L = Lz;
        L->ruleline = candidates[i];
        L->ruleindent = minindent;
        L->width = maxlen;
        L->ntablecolumns = ncolumns;
        L->is_gap = is_gap;
        L->col = col;
        L->ncols = ncols;
        is_gap = NULL;  /* Ownership passed to LAYOUT; don't free it below. */
        ret = 1;
        break;
    }

done:
    free(candidates);
    free(is_gap);
    return ret;
}
/* paragraph_is_part_of_table PARAGRAPH LAYOUT FIRST
 * Return 1 if PARAGRAPH fits the table LAYOUT whose first paragraph is FIRST,
 * and 0 otherwise. A paragraph fits when none of its lines is longer than the
 * table width and none has a non-space character in a gap column. We permit
 * a table to be the contents of a bullet, so a PARAGRAPH which has a leader
 * (its type is not `none') and an indent the same as or smaller than that of
 * FIRST cannot be part of the table. */
static int paragraph_is_part_of_table(const struct paragraph *P, const struct table_layout *L, const struct paragraph *first) {
    size_t row;

    for (row = 0; row < P->nlines; ++row) {
        const char *text = P->lines[row];
        size_t pos, textlen = strlen(text);

        /* Too wide for the table. */
        if (textlen > L->width)
            return 0;

        /* Text intruding into a column gap. */
        for (pos = 0; pos < textlen; ++pos) {
            if (text[pos] != ' ' && L->is_gap[pos])
                return 0;
        }
    }

    /* A paragraph with a leader at or left of the table's indent starts
     * something new rather than continuing the table. */
    return (P->type == none || P->indent > first->indent) ? 1 : 0;
}
/* extract_paragraphs PARAGRAPH STARTLINE NLINES STARTCOL NCOLS
 * Process the rectangle defined by STARTLINE, NLINES and STARTCOL, NCOLS
 * in the text of PARAGRAPH, breaking it into individual paragraphs and
 * returning them as a doubly linked list. Within the rectangle, a line which
 * is empty once trailing spaces are stripped separates one paragraph from the
 * next. Each paragraph is passed to classify_paragraph, and the finished list
 * is passed to process_tables so that nested tables are handled. If no
 * paragraphs can be extracted (including when the scratch buffer cannot be
 * allocated), return NULL. */
static struct paragraph *extract_paragraphs(const struct paragraph *para, const size_t startline, const size_t nlines, const size_t startcol, const size_t ncols) {
    const size_t endline = startline + nlines;
    size_t n;
    struct paragraph *ret = NULL, *last = NULL, *cur = NULL;
    char *buf;

    buf = malloc(ncols + 1);
    if (!buf)
        return NULL;

    /* Walk through the lines in para, excising the bit of text defined by
     * startcol and ncols, and add a copy of same to the current paragraph.
     * The extra pass with n == endline behaves like a trailing blank line, so
     * the final paragraph is finished by the same code as all the others. */
    for (n = startline; n <= endline; ++n) {
        memset(buf, 0, ncols + 1);

        if (n < endline) {
            size_t len = strlen(para->lines[n]);

            if (len > startcol) {
                /* Copy at most ncols characters, fewer if the line is short. */
                size_t m = len - startcol;

                if (m > ncols)
                    m = ncols;
                memcpy(buf, para->lines[n] + startcol, m);

                /* Strip trailing whitespace. */
                while (m > 0 && buf[m - 1] == ' ')
                    buf[--m] = 0;
            }
        }

        if (*buf) {
            if (!cur) {
                /* Line starts new paragraph. No paragraph can hold more
                 * lines than remain in para from here on. */
                alloc_struct(paragraph, cur);
                cur->lines = malloc((para->nlines - n) * sizeof *cur->lines);
            }
            cur->lines[cur->nlines++] = strdup(buf);
        } else if (cur) {
            /* Blank line ends paragraph. Classify it and append it to the
             * list. */
            classify_paragraph(cur);
            if (last) {
                last->next = cur;
                cur->prev = last;
            } else
                ret = cur;
            /* Advance the tail, so that the next paragraph is linked after
             * this one rather than replacing it. */
            last = cur;
            cur = NULL;
        }
    }

    free(buf);

    /* Recursively process these paragraphs. */
    process_tables(ret);

    return ret;
}
/* process_table_rows FIRST LAST LAYOUT
 * Consume paragraphs from FIRST to LAST inclusive which are part of the given
 * LAYOUT, replacing each paragraph with one or more rows of table cells.
 * FIRST becomes the header row ("tr" holding one "th" per table column); a
 * copy of FIRST is inserted after it to carry whatever lies below the rule
 * line. Every other paragraph which has lines becomes a "tr" holding one "td"
 * per table column. The text of each cell comes from extract_paragraphs.
 * Always returns NULL. */
static struct paragraph *process_table_rows(struct paragraph *first, struct paragraph *last, const struct table_layout *L) {
    struct paragraph *P = first;
    /* Sampled before the list is modified. */
    const int only_one_para = (first == last);
    int is_first_para = 1;
    size_t i;

    /*
     * Each paragraph except the first represents a single table row, so we
     * can replace paragraphs with row containers in-place.
     */
    for (;;) {
        struct paragraph *cells = NULL, *tail = NULL;
        size_t startline = 0;

        if (is_first_para) {
            /*
             * The first row of the table is a special case, because of the
             * rule line, above which is the header and below which may be cell
             * data.
             */
            struct paragraph *newpara;

            for (i = 0; i < L->ntablecolumns; ++i) {
                /* Add a header cell for this column to the row. */
                struct paragraph *p_th;

                alloc_struct(paragraph, p_th);
                p_th->container = "th";
                p_th->contents = extract_paragraphs(P, 0, L->ruleline, L->col[i], L->ncols[i]);
                p_th->prev = tail;
                if (tail)
                    tail->next = p_th;
                else
                    cells = p_th;
                tail = p_th;
            }

            /* Because we don't want to move first or last, we insert a new
             * paragraph after first and replace first with this row. The copy
             * takes over first's lines, so first must forget them. */
            alloc_struct(paragraph, newpara);
            *newpara = *first;
            newpara->prev = first;
            first->next = newpara;
            if (newpara->next)
                newpara->next->prev = newpara;

            first->container = "tr";
            first->contents = cells;
            first->nlines = 0;
            first->lines = NULL;

            P = newpara;

            /* Bits under rule line are normal cells. */
            startline = L->ruleline + 1;
            is_first_para = 0;
        }

        /* NOTE(review): when the rule is the last line of the first
         * paragraph, the inserted copy is not turned into a row here and
         * keeps the header and rule lines -- confirm that is intended. */
        if (startline < P->nlines) {
            cells = tail = NULL;
            for (i = 0; i < L->ntablecolumns; ++i) {
                /* Add a data cell for this column to the row. */
                struct paragraph *p_td;

                alloc_struct(paragraph, p_td);
                p_td->container = "td";
                p_td->contents = extract_paragraphs(P, startline, P->nlines - startline, L->col[i], L->ncols[i]);
                p_td->prev = tail;
                if (tail)
                    tail->next = p_td;
                else
                    cells = p_td;
                tail = p_td;
            }

            /* Replace this paragraph's contents. The text now lives in the
             * cells, so release the lines and clear the pointer rather than
             * leave it dangling. */
            P->container = "tr";
            P->contents = cells;
            for (i = 0; i < P->nlines; ++i)
                free(P->lines[i]);
            free(P->lines);
            P->lines = NULL;
            P->nlines = 0;
        }

        if (only_one_para || P == last)
            break;
        P = P->next;
    }

    return NULL;
}
/* process_tables PARAGRAPHS
* Go through the list of PARAGRAPHS, identifying extents which form part of
* a table and splitting them up into individual cells. Each cell is processed
* recursively by process_tables, so that tables-within-tables may be
* implemented (god help us). */
void process_tables(struct paragraph *paras) {
struct paragraph *P;
for (P = paras; P; ) {
struct table_layout tl;
if (paragraph_is_start_of_table(P, &tl)) {
struct paragraph *first, *last, *rows;
first = P;
for (last = first; last->next && paragraph_is_part_of_table(last->next, &tl, first); last = last->next);
/* Found a table from first to last inclusive. */
rows = process_table_rows(first, last, &tl);
/* A one-paragraph table will have expanded to two. */
if (first == last)
last = first->next;
create_container("table", first, last, 0);
free(tl.is_gap);
free(tl.col);
free(tl.ncols);
}
P = P->next;
}
}
|