/* * tables.c: * Table support for htmlise. * * The idea here is that we look for a paragraph which looks like a table * heading followed by a number of paragraphs which look like table cells. * We do not support sophisticated tables or physical mark-up like borders * and alignment. Tables look like this: * * Heading 1 Heading 2 Heading 3 <-- heading * ---------- ---------- ------------- <-- `table rule' * Text in More text Third cell | * cell in another |<-- first row * cell | * * New row Next cell Bottom-right | * in new row cell of |<-- second row * table | * * The table is recognised because its first paragraph contains a line of * dashes interleaved with spaces, the spaces are present on every line of the * table, and no line in the paragraph is longer than the line of dashes. The * text above the dashes is the table heading, and is marked up as , * and that below is just data, which lives in . Further paragraphs * which have lined-up spaces and consist only of lines of length equal to or * shorter than the header rule length are also considered to be part of the * table, with each paragraph being an individual row. The contents of each * cell are then processed recursively by htmlise. * * Where there are several possible overlapping tables which could be * constructed by the procedure, we pick the earliest-starting one. * * Extensions: * * - should also permit horizontal tables (with the on the left) * * Copyright (c) 2003 Chris Lightfoot. All rights reserved. * Email: chris@ex-parrot.com; WWW: http://www.ex-parrot.com/~chris/ * */ static const char rcsid[] = "$Id: tables.c,v 1.4 2004/01/29 22:52:28 chris Exp $"; #include #include #include #include "htmlise.h" struct table_layout { /* ruleline is the line number of the table rule in the first paragraph of * the table; ruleindent the indent of the table rule and width the line * length of the table rule. ntablecolumns is the number of columns in the * table. */ size_t ruleline, ruleindent, width, ntablecolumns; /* is_gap consists of width flags each of which is true if the * corresponding column must be whitespace in each line of the table. */ char *is_gap; /* col and ncols store the starting column positions and character widths * of the ntablecolumns individual table columns. */ size_t *col, *ncols; }; /* paragraph_is_start_of_table PARAGRAPH LAYOUT * If PARAGRAPH is a possible start-of-table, fill in LAYOUT with the layout of * the table and return 1. Otherwise, return 0. The table rule is set to the * first possible candidate rule. On successful return, LAYOUT->is_gap, col and * ncols are allocated on the heap and must be freed by the caller. */ static int paragraph_is_start_of_table(const struct paragraph *P, struct table_layout *L) { size_t i, j, minindent = 1000000, maxlen = 0, *candidates, ncandidates = 0; char *is_gap = NULL; int ret = 0; candidates = malloc(P->nlines * sizeof *candidates); /* Look at each line in the paragraph and find the minimum indent and * maximum length of the lines. Also identify any candidate table rules. */ for (i = 0; i < P->nlines; ++i) { size_t indent, len, ngaps = 0; indent = strspn(P->lines[i], " "); len = strlen(P->lines[i]); /* Find horizontal extent of paragraph. */ if (indent < minindent) minindent = indent; if (len > maxlen) { is_gap = realloc(is_gap, len); for (j = maxlen; j < len; ++j) is_gap[j] = P->lines[i][j] == ' ' ? 1 : 0; maxlen = len; } /* Find gaps. */ for (j = 0; j < len; ++j) if (P->lines[i][j] != ' ') is_gap[j] = 0; else ++ngaps; /* Is this line a candidate table rule? It can't be the first line of * the paragraph, obviously. */ if (i > 0 && strspn(P->lines[i], "- ") == len && ngaps > 0) candidates[ncandidates++] = i; } /* Now find any candidate table rule which satisfies the constraints. */ for (i = 0; i < ncandidates; ++i) { size_t len; char *line; line = P->lines[candidates[i]]; len = strlen(line); for (j = 0; j < len; ++j) if (line[j] == ' ' && !is_gap[j]) break; if (j == len) { int f; size_t j, n; struct table_layout Lz = { 0 }; *L = Lz; /* Choose this one. */ ret = 1; L->is_gap = is_gap; is_gap = NULL; /* Don't free it. */ L->ruleline = candidates[i]; L->ruleindent = minindent; L->width = maxlen; /* Figure out how many columns we have and where in the line * they are. */ for (j = 0, f = 0, L->ntablecolumns = 0; j < L->width; ++j) { if (!L->is_gap[j] && !f) { f = 1; ++L->ntablecolumns; } else if (L->is_gap[j]) f = 0; } L->col = malloc(L->ntablecolumns * sizeof *L->col); L->ncols = malloc(L->ntablecolumns * sizeof *L->ncols); for (j = 0, n = 0; j < L->width; ++j) { if (!L->is_gap[j] && (j == 0 || L->is_gap[j - 1])) L->col[n] = j; if (L->is_gap[j] && j > 0 && !L->is_gap[j - 1]) { L->ncols[n] = j - L->col[n]; ++n; } } if (n < L->ntablecolumns) { L->ncols[n] = j - L->col[n] + 1; ++n; } assert(n == L->ntablecolumns); break; } } if (candidates) free(candidates); if (is_gap) free(is_gap); return ret; } /* paragraph_is_part_of_table PARAGRAPH LAYOUT FIRST * If PARAGRAPH fits the given table LAYOUT startbing with FIRST, return 1. * Otherwise, return 0. We permit a table to be the contents of a bullet, so if * PARAGRAPH is has the same or smaller indent as FIRST and has a leader, it * cannot be part of the table. */ static int paragraph_is_part_of_table(const struct paragraph *P, const struct table_layout *L, const struct paragraph *first) { size_t i; for (i = 0; i < P->nlines; ++i) { char *line; size_t j, len; line = P->lines[i]; len = strlen(line); if (len > L->width) return 0; for (j = 0; j < len; ++j) if (L->is_gap[j] && line[j] != ' ') return 0; } if (P->type != none && P->indent <= first->indent) return 0; return 1; } /* extract_paragraphs PARAGRAPH STARTLINE NLINES STARTCOL NCOLS * Process the rectangle defined by STARTLINE, NLINES and STARTCOL, NCOLS * in the text of PARAGRAPH, breaking it into individual paragraphs and * returning them as a linked list. If no paragraphs can be extracted, return * NULL. */ static struct paragraph *extract_paragraphs(const struct paragraph *para, const size_t startline, const size_t nlines, const size_t startcol, const size_t ncols) { size_t n; struct paragraph *ret = NULL, *last = NULL, *cur = NULL; char *buf; buf = malloc(ncols + 1); /* Walk through the lines in para, excising the bit of text defined by the * startcol and ncols, and add a copy of same to the current paragraph. */ for (n = startline; n < startline + nlines; ++n) { size_t len; memset(buf, 0, ncols + 1); len = strlen(para->lines[n]); if (len > startcol) { size_t m; m = ncols; if (m > len - startcol) m = len - startcol; memcpy(buf, para->lines[n] + startcol, m); /* Strip trailing whitespace. */ while (m > 0) { if (buf[m - 1] == ' ') buf[m - 1] = 0; else break; --m; } } /* Line ends paragraph. Classify and save it. */ if (!*buf && cur) { classify_paragraph(cur); if (last) { last->next = cur; cur->prev = last; } else last = cur; if (!ret) ret = last; cur = NULL; } if (*buf) { if (!cur) { /* Line starts new paragraph. */ alloc_struct(paragraph, cur); cur->lines = malloc((para->nlines - n) * sizeof *cur->lines); } cur->lines[cur->nlines++] = strdup(buf); } } if (cur) { classify_paragraph(cur); if (last) { last->next = cur; cur->prev = last; } else last = cur; if (!ret) ret = last; } free(buf); /* Recursively process these paragraphs. */ process_tables(ret); return ret; } /* process_table_rows FIRST LAST LAYOUT * Consume paragraphs from FIRST to LAST inclusive which are part of the given * LAYOUT, replacing each paragraph with one or more rows of table cells. */ static struct paragraph *process_table_rows(struct paragraph *first, struct paragraph *last, const struct table_layout *L) { struct paragraph *P, *ret = NULL; int is_first_para = 1, only_one_para; size_t i; /* * Each paragraph except the first represents a single table row, so we * can replace paragraphs with row containers in-place. */ only_one_para = (first == last); P = first; while (1) { struct paragraph *cells = NULL, *l = NULL; size_t startline = 0; if (is_first_para) { /* * The first row of the table is a special case, because of the * rule line, above which is the header and below which may be cell * data. */ struct paragraph *newpara; for (i = 0; i < L->ntablecolumns; ++i) { /* Add a to the row. */ struct paragraph *p_th; alloc_struct(paragraph, p_th); p_th->container = "th"; p_th->contents = extract_paragraphs(P, 0, L->ruleline, L->col[i], L->ncols[i]); p_th->prev = l; if (l) { l->next = p_th; l = l->next; } else { l = cells = p_th; } } /* Because we don't want to move first or last, we insert a new * paragraph after first and replace first with this row. */ alloc_struct(paragraph, newpara); *newpara = *first; newpara->prev = first; first->next = newpara; if (newpara->next) newpara->next->prev = newpara; first->container = "tr"; first->contents = cells; first->nlines = 0; first->lines = NULL; P = newpara; /* Bits under rule line are normal cells. */ startline = L->ruleline + 1; is_first_para = 0; } if (startline < P->nlines) { cells = l = NULL; for (i = 0; i < L->ntablecolumns; ++i) { /* Add a to the row. */ struct paragraph *p_td; alloc_struct(paragraph, p_td); p_td->container = "td"; p_td->contents = extract_paragraphs(P, startline, P->nlines - startline, L->col[i], L->ncols[i]); p_td->prev = l; if (l) { l->next = p_td; l = l->next; } else { l = cells = p_td; } } /* Replace this paragraph's contents. */ P->container = "tr"; P->contents = cells; for (i = 0; i < P->nlines; ++i) free(P->lines[i]); free(P->lines); P->nlines = 0; } if (only_one_para || P == last) break; P = P->next; } return ret; } /* process_tables PARAGRAPHS * Go through the list of PARAGRAPHS, identifying extents which form part of * a table and splitting them up into individual cells. Each cell is processed * recursively by process_tables, so that tables-within-tables may be * implemented (god help us). */ void process_tables(struct paragraph *paras) { struct paragraph *P; for (P = paras; P; ) { struct table_layout tl; if (paragraph_is_start_of_table(P, &tl)) { struct paragraph *first, *last, *rows; first = P; for (last = first; last->next && paragraph_is_part_of_table(last->next, &tl, first); last = last->next); /* Found a table from first to last inclusive. */ rows = process_table_rows(first, last, &tl); /* A one-paragraph table will have expanded to two. */ if (first == last) last = first->next; create_container("table", first, last, 0); free(tl.is_gap); free(tl.col); free(tl.ncols); } P = P->next; } }