/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/* GMime
* Copyright (C) 2000-2007 Jeffrey Stedfast
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <stdio.h>
#include <string.h>
#include "gtrie.h"
#include "memchunk.h"
#define d(x)
struct _trie_state {
struct _trie_state *next;
struct _trie_state *fail;
struct _trie_match *match;
unsigned int final;
int id;
};
struct _trie_match {
struct _trie_match *next;
struct _trie_state *state;
gunichar c;
};
struct _GTrie {
struct _trie_state root;
GPtrArray *fail_states;
gboolean icase;
MemChunk *match_chunks;
MemChunk *state_chunks;
};
static inline gunichar
trie_utf8_getc (const unsigned char **in, size_t inlen)
{
register const unsigned char *inptr = *in;
const unsigned char *inend = inptr + inlen;
register unsigned char c, r;
register gunichar m, u = 0;
if (inlen == 0)
return 0;
r = *inptr++;
if (r < 0x80) {
*in = inptr;
u = r;
} else if (r < 0xfe) { /* valid start char? */
u = r;
m = 0x7f80; /* used to mask out the length bits */
do {
if (inptr >= inend)
return 0;
c = *inptr++;
if ((c & 0xc0) != 0x80)
goto error;
u = (u << 6) | (c & 0x3f);
r <<= 1;
m <<= 5;
} while (r & 0x40);
*in = inptr;
u &= ~m;
} else {
error:
*in = (*in) + 1;
u = 0xfffe;
}
return u;
}
GTrie *
g_trie_new (gboolean icase)
{
GTrie *trie;
trie = g_new (GTrie, 1);
trie->root.next = NULL;
trie->root.fail = NULL;
trie->root.match = NULL;
trie->root.final = 0;
trie->fail_states = g_ptr_array_sized_new (8);
trie->icase = icase;
trie->match_chunks = memchunk_new (sizeof (struct _trie_match), 8, FALSE);
trie->state_chunks = memchunk_new (sizeof (struct _trie_state), 8, FALSE);
return trie;
}
void
g_trie_free (GTrie *trie)
{
g_ptr_array_free (trie->fail_states, TRUE);
memchunk_destroy (trie->match_chunks);
memchunk_destroy (trie->state_chunks);
g_free (trie);
}
static struct _trie_match *
g (struct _trie_state *s, gunichar c)
{
struct _trie_match *m = s->match;
while (m && m->c != c)
m = m->next;
return m;
}
static struct _trie_state *
trie_insert (GTrie *trie, int depth, struct _trie_state *q, gunichar c)
{
struct _trie_match *m;
m = memchunk_alloc (trie->match_chunks);
m->next = q->match;
m->c = c;
q->match = m;
q = m->state = memchunk_alloc (trie->state_chunks);
q->match = NULL;
q->fail = &trie->root;
q->final = 0;
q->id = 0;
if (trie->fail_states->len < depth + 1) {
unsigned int size = trie->fail_states->len;
size = MAX (size + 64, depth + 1);
g_ptr_array_set_size (trie->fail_states, size);
}
q->next = trie->fail_states->pdata[depth];
trie->fail_states->pdata[depth] = q;
return q;
}
#if d(!)0
static void
dump_trie (struct _trie_state *s, int depth)
{
char *p = g_alloca ((depth * 2) + 1);
struct _trie_match *m;
memset (p, ' ', depth * 2);
p[depth * 2] = '\0';
fprintf (stderr, "%s[state] %p: final=%d; pattern-id=%s; fail=%p\n",
p, s, s->final, s->id, s->fail);
m = s->match;
while (m) {
fprintf (stderr, " %s'%c' -> %p\n", p, m->c, m->state);
if (m->state)
dump_trie (m->state, depth + 1);
m = m->next;
}
}
#endif
/*
* final = empty set
* FOR p = 1 TO #pat
* q = root
* FOR j = 1 TO m[p]
* IF g(q, pat[p][j]) == null
* insert(q, pat[p][j])
* ENDIF
* q = g(q, pat[p][j])
* ENDFOR
* final = union(final, q)
* ENDFOR
*/
void
g_trie_add (GTrie *trie, const char *pattern, int pattern_id)
{
const unsigned char *inptr = (const unsigned char *) pattern;
struct _trie_state *q, *q1, *r;
struct _trie_match *m, *n;
int i, depth = 0;
gunichar c;
/* Step 1: add the pattern to the trie */
q = &trie->root;
while ((c = trie_utf8_getc (&inptr, -1))) {
if (c == 0xfffe) {
g_warning ("Invalid UTF-8 sequence in pattern '%s' at %s",
pattern, (inptr - 1));
continue;
}
if (trie->icase)
c = g_unichar_tolower (c);
m = g (q, c);
if (m == NULL) {
q = trie_insert (trie, depth, q, c);
} else {
q = m->state;
}
depth++;
}
q->final = depth;
q->id = pattern_id;
/* Step 2: compute failure graph */
for (i = 0; i < trie->fail_states->len; i++) {
q = trie->fail_states->pdata[i];
while (q) {
m = q->match;
while (m) {
c = m->c;
q1 = m->state;
r = q->fail;
while (r && (n = g (r, c)) == NULL)
r = r->fail;
if (r != NULL) {
q1->fail = n->state;
if (q1->fail->final > q1->final)
q1->final = q1->fail->final;
} else {
if ((n = g (&trie->root, c)))
q1->fail = n->state;
else
q1->fail = &trie->root;
}
m = m->next;
}
q = q->next;
}
}
d(fprintf (stderr, "\nafter adding pattern '%s' to trie %p:\n", pattern, trie));
d(dump_trie (&trie->root, 0));
}
/*
* Aho-Corasick
*
* q = root
* FOR i = 1 TO n
* WHILE q != fail AND g(q, text[i]) == fail
* q = h(q)
* ENDWHILE
* IF q == fail
* q = root
* ELSE
* q = g(q, text[i])
* ENDIF
* IF isElement(q, final)
* RETURN TRUE
* ENDIF
* ENDFOR
* RETURN FALSE
*/
const char *
g_trie_search (GTrie *trie, const char *buffer, size_t buflen, int *matched_id)
{
const unsigned char *inptr, *inend, *prev, *pat;
register size_t inlen = buflen;
struct _trie_state *q;
struct _trie_match *m = NULL;
gunichar c;
inptr = (const unsigned char *) buffer;
inend = inptr + buflen;
q = &trie->root;
pat = prev = inptr;
while ((c = trie_utf8_getc (&inptr, inlen))) {
inlen = (inend - inptr);
if (c == 0xfffe) {
prev = (inptr - 1);
pat = (const unsigned char *) buffer + buflen;
g_warning ("Invalid UTF-8 in buffer '%.*s' at %.*s",
buflen, buffer, pat - prev, prev);
pat = prev = inptr;
}
if (trie->icase)
c = g_unichar_tolower (c);
while (q != NULL && (m = g (q, c)) == NULL)
q = q->fail;
if (q == &trie->root)
pat = prev;
if (q == NULL) {
q = &trie->root;
pat = inptr;
} else if (m != NULL) {
q = m->state;
if (q->final) {
if (matched_id)
*matched_id = q->id;
return (const char *) pat;
}
}
prev = inptr;
}
return NULL;
}
#ifdef TEST
static char *patterns[] = {
"news://",
"nntp://",
"telnet://",
"file://",
"ftp://",
"http://",
"https://",
"www.",
"ftp.",
"mailto:",
"@"
};
static char *haystacks[] = {
"try this url: http://www.ximian.com",
"or, feel free to email me at fejj@ximian.com",
"don't forget to check out www.ximian.com",
"I've attached a file (file:///cvs/gmime/gmime/gtrie.c)",
};
int main (int argc, char **argv)
{
const char *match;
GTrie *trie;
int id, i;
trie = g_trie_new (TRUE);
for (i = 0; i < (sizeof (patterns) / sizeof (patterns[0])); i++)
g_trie_add (trie, patterns[i], i);
for (i = 0; i < (sizeof (haystacks) / sizeof (haystacks[0])); i++) {
if ((match = g_trie_search (trie, haystacks[i], -1, &id))) {
fprintf (stderr, "matched @ '%s' with pattern '%s'\n", match, patterns[id]);
} else {
fprintf (stderr, "no match\n");
}
}
fflush (stdout);
g_trie_free (trie);
return match == NULL ? 0 : 1;
}
#endif /* TEST */
syntax highlighted by Code2HTML, v. 0.9.1