[w3m-dev 02811] new regexp implementation

From: aito@fw.ipsj.or.jp
This commit is contained in:
Fumitoshi UKAI
2002-01-10 04:55:06 +00:00
parent 3d272a74fc
commit 31d84e0083
4 changed files with 535 additions and 161 deletions

View File

@@ -1,3 +1,26 @@
2002-01-10 aito@fw.ipsj.or.jp
* [w3m-dev 02811] new regexp implementation
* regex.h: remove defines
* regex.h: struct regexchar, struct regex
* regex.c: include stdio.h, stdlib.h, string.h, config.h
* regex.c: remove include fm.h
* regex.c (NULL JP_CHARSET IS_ALPHA IS_KANJI1): define unless defined
* regex.c (RE_*): define
* regex.c (st_ptr): deleted
* regex.c (regmatch): remove 4th arg `int'
* regex.c (newRegex0): rewrite, based on old newRegex
* regex.c (newRegex): use newRegex0()
* regex.c (RegexMatch): rewrite
* regex.c (struct MatchingContext1): added
* regex.c (struct MatchingContext2): added
* regex.c (YIELD): added
* regex.c (regmatch_sub_anytime): added
* regex.c (regmatch_iter): added
* regex.c (regmatch): use regmatch_iter()
* regex.c (regmatch1): rewrite
* regex.c (lc2c): add 2nd arg `len', rewrite
2002-01-10 Yoshinobu Sakane <sakane@d4.bsd.nes.nec.co.jp> 2002-01-10 Yoshinobu Sakane <sakane@d4.bsd.nes.nec.co.jp>
* [w3m-dev 02810] * [w3m-dev 02810]
@@ -1795,4 +1818,4 @@
* release-0-2-1 * release-0-2-1
* import w3m-0.2.1 * import w3m-0.2.1
$Id: ChangeLog,v 1.202 2002/01/10 03:45:13 ukai Exp $ $Id: ChangeLog,v 1.203 2002/01/10 04:55:06 ukai Exp $

5
NEWS
View File

@@ -1,3 +1,8 @@
w3m 0.3?
* new regex implementation
----------------------------------------------------------------
w3m 0.2.4 - 2002-01-07 w3m 0.2.4 - 2002-01-07
* RFC2818 server identity check * RFC2818 server identity check

631
regex.c
View File

@@ -1,18 +1,60 @@
/* $Id: regex.c,v 1.6 2001/11/30 10:10:24 ukai Exp $ */ /* $Id: regex.c,v 1.7 2002/01/10 04:55:07 ukai Exp $ */
/* /*
* regex: Regular expression pattern match library * regex: Regular expression pattern match library
* *
* by A.ITO, December 1989 * by A.ITO, December 1989
* Revised by A.ITO, January 2002
*/ */
#ifdef REGEX_DEBUG #ifdef REGEX_DEBUG
#include <sys/types.h> #include <sys/types.h>
#include <malloc.h> #include <malloc.h>
#endif /* REGEX_DEBUG */ #endif /* REGEX_DEBUG */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h> #include <ctype.h>
#include <gc.h> #include <gc.h>
#include "fm.h"
#include "regex.h" #include "regex.h"
#include "config.h"
#ifndef NULL
#define NULL 0
#endif /* not NULL */
#if LANG == JA
#define JP_CHARSET
#endif
#define RE_ITER_LIMIT 65535
#define RE_MATCHMODE 0x07
#define RE_NORMAL 0x00
#define RE_ANY 0x01
#define RE_WHICH 0x02
#define RE_EXCEPT 0x03
#define RE_SUBREGEX 0x04
#define RE_BEGIN 0x05
#define RE_END 0x06
#define RE_ENDMARK 0x07
#define RE_OPT 0x08
#define RE_ANYTIME 0x10
#define RE_IGNCASE 0x40
#define RE_MODE(x) ((x)->mode&RE_MATCHMODE)
#define RE_SET_MODE(x,v) ((x)->mode = (((x)->mode&~RE_MATCHMODE)|((v)&RE_MATCHMODE)))
#ifdef REGEX_DEBUG
void debugre(regexchar *);
char *lc2c(longchar *, int);
int verbose;
#endif /* REGEX_DEBUG */
#ifndef IS_ALPHA
#define IS_ALPHA(x) (!((x)&0x80) && isalpha(x))
#define IS_KANJI1(x) ((x)&0x80)
#endif
#ifdef JP_CHARSET #ifdef JP_CHARSET
#define RE_KANJI(p) (((unsigned char)*(p) << 8) | (unsigned char)*((p)+1)) #define RE_KANJI(p) (((unsigned char)*(p) << 8) | (unsigned char)*((p)+1))
@@ -24,13 +66,10 @@ static Regex DefaultRegex;
#define CompiledRegex DefaultRegex.re #define CompiledRegex DefaultRegex.re
#define Cstorage DefaultRegex.storage #define Cstorage DefaultRegex.storage
static longchar *st_ptr; static int regmatch(regexchar *, char *, int, char **);
static int regmatch(regexchar *, char *, int, int, char **);
static int regmatch1(regexchar *, longchar); static int regmatch1(regexchar *, longchar);
static int matchWhich(longchar *, longchar); static int matchWhich(longchar *, longchar);
/* /*
* regexCompile: compile regular expression * regexCompile: compile regular expression
*/ */
@@ -42,41 +81,66 @@ regexCompile(char *ex, int igncase)
return msg; return msg;
} }
Regex * static Regex *
newRegex(char *ex, int igncase, Regex *regex, char **msg) newRegex0(char **ex, int igncase, Regex *regex, char **msg, int level)
{ {
char *p; char *p;
longchar *r; longchar *r;
regexchar *re = regex->re - 1; regexchar *re;
int m; int m;
longchar *st_ptr;
if (regex == 0) if (regex == NULL)
regex = (Regex *)GC_malloc_atomic(sizeof(Regex)); regex = (Regex *)GC_malloc(sizeof(Regex));
regex->alt_regex = NULL;
re = regex->re;
st_ptr = regex->storage; st_ptr = regex->storage;
for (p = ex; *p != '\0'; p++) { for (p = *ex; *p != '\0'; p++) {
re->mode = 0;
switch (*p) { switch (*p) {
case '.': case '.':
re->p.pattern = NULL;
RE_SET_MODE(re, RE_ANY);
re++; re++;
re->pattern = NULL;
re->mode = RE_ANY;
break; break;
case '$': case '$':
re->p.pattern = NULL;
RE_SET_MODE(re, RE_END);
re++; re++;
re->pattern = NULL;
re->mode = RE_END;
break; break;
case '^': case '^':
re->p.pattern = NULL;
RE_SET_MODE(re, RE_BEGIN);
re++; re++;
re->pattern = NULL;
re->mode = RE_BEGIN;
break; break;
case '*': case '+':
if (!(re->mode & RE_ANY) && re->pattern == NULL) { if (re == regex->re ||
(RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) {
if (msg) if (msg)
*msg = "Invalid regular expression"; *msg = "Invalid regular expression";
return NULL; return NULL;
} }
*re = *(re - 1);
re->mode |= RE_ANYTIME; re->mode |= RE_ANYTIME;
re++;
break;
case '*':
if (re == regex->re ||
(RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) {
if (msg)
*msg = "Invalid regular expression";
return NULL;
}
(re - 1)->mode |= RE_ANYTIME;
break;
case '?':
if (re == regex->re ||
(RE_MODE(re - 1) != RE_ANY && (re - 1)->p.pattern == NULL)) {
if (msg)
*msg = "Invalid regular expression";
return NULL;
}
(re - 1)->mode |= RE_OPT;
break; break;
case '[': case '[':
r = st_ptr; r = st_ptr;
@@ -110,14 +174,40 @@ newRegex(char *ex, int igncase, Regex *regex, char **msg)
*(st_ptr++) = (unsigned char)*(p++); *(st_ptr++) = (unsigned char)*(p++);
} }
*(st_ptr++) = '\0'; *(st_ptr++) = '\0';
re->p.pattern = r;
RE_SET_MODE(re, m);
re++; re++;
re->pattern = r;
re->mode = m;
break; break;
case '|':
RE_SET_MODE(re, RE_ENDMARK);
re++;
p++;
regex->alt_regex = newRegex0(&p, igncase, NULL, msg, level);
if (regex->alt_regex == NULL)
return NULL;
*ex = p;
return regex;
case '(':
RE_SET_MODE(re, RE_SUBREGEX);
p++;
re->p.sub = newRegex0(&p, igncase, NULL, msg, level + 1);
if (re->p.sub == NULL)
return NULL;
re++;
break;
case ')':
if (level == 0) {
if (msg)
*msg = "Too many ')'";
return NULL;
}
RE_SET_MODE(re, RE_ENDMARK);
re++;
*ex = p;
return regex;
case '\\': case '\\':
p++; p++;
default: default:
re++;
#ifdef JP_CHARSET #ifdef JP_CHARSET
if (IS_KANJI1(*p)) { if (IS_KANJI1(*p)) {
*(st_ptr) = RE_KANJI(p); *(st_ptr) = RE_KANJI(p);
@@ -126,26 +216,33 @@ newRegex(char *ex, int igncase, Regex *regex, char **msg)
else else
#endif #endif
*st_ptr = (unsigned char)*p; *st_ptr = (unsigned char)*p;
re->pattern = st_ptr; re->p.pattern = st_ptr;
st_ptr++; st_ptr++;
re->mode = RE_NORMAL; RE_SET_MODE(re, RE_NORMAL);
if (igncase) if (igncase)
re->mode |= RE_IGNCASE; re->mode |= RE_IGNCASE;
re++;
} }
if (st_ptr >= &Cstorage[STORAGE_MAX] || if (st_ptr >= &regex->storage[STORAGE_MAX] ||
re >= &CompiledRegex[REGEX_MAX]) { re >= &regex->re[REGEX_MAX]) {
if (msg) if (msg)
*msg = "Regular expression too long"; *msg = "Regular expression too long";
return NULL; return NULL;
} }
} }
re++; RE_SET_MODE(re, RE_ENDMARK);
re->mode = RE_ENDMARK;
if (msg) if (msg)
*msg = NULL; *msg = NULL;
*ex = p;
return regex; return regex;
} }
/*
 * newRegex: compile the regular expression text `ex' into `regex'.
 *
 * Public entry point; it simply starts newRegex0() at nesting level 0,
 * so a ')' without a matching '(' is rejected there.  When `regex' is
 * NULL, newRegex0() allocates the Regex itself.  Returns the compiled
 * Regex, or NULL on a syntax error (newRegex0() then stores a message
 * through `msg' if `msg' is non-NULL).
 */
Regex *
newRegex(char *ex, int igncase, Regex *regex, char **msg)
{
    /* newRegex0() advances the pattern pointer through this handle;
     * the caller's own pointer is never modified. */
    char **cursor = &ex;

    return newRegex0(cursor, igncase, regex, msg, 0);
}
/* /*
* regexMatch: match regular expression * regexMatch: match regular expression
*/ */
@@ -159,20 +256,33 @@ int
RegexMatch(Regex *re, char *str, int len, int firstp) RegexMatch(Regex *re, char *str, int len, int firstp)
{ {
char *p, *ep; char *p, *ep;
char *lpos;
Regex *r;
if (str == NULL) if (str == NULL)
return 0; return 0;
if (len == 0)
len = strlen(str);
re->position = NULL; re->position = NULL;
ep = str + ((len == 0) ? strlen(str) : len); ep = str + len;
for (p = str; p < ep; p++) { for (p = str; p < ep; p++) {
switch (regmatch lpos = NULL;
(re->re, p, ep - p, firstp && (p == str), &re->lposition)) { re->lposition = NULL;
case 1: for (r = re; r != NULL; r = r->alt_regex) {
re->position = p; switch (regmatch(r->re, p, firstp && (p == str), &lpos)) {
case 1: /* matched */
re->position = p;
if (re->lposition == NULL || re->lposition < lpos)
re->lposition = lpos;
break;
case -1: /* error */
re->position = NULL;
return -1;
}
}
if (re->lposition != NULL) {
/* matched */
return 1; return 1;
case -1:
re->position = NULL;
return -1;
} }
#ifdef JP_CHARSET #ifdef JP_CHARSET
if (IS_KANJI1(*p)) if (IS_KANJI1(*p))
@@ -202,118 +312,306 @@ matchedPosition(char **first, char **last)
/* /*
* Internal routines * Internal routines
*/ */
static int
regmatch(regexchar * re, char *str, int len, int firstp, char **lastpos)
{
char *p = str, *ep = str + len;
char *lpos, *llpos = NULL;
longchar k;
*lastpos = NULL; struct MatchingContext1 {
#ifdef REGEX_DEBUG int label;
debugre(re, str); regexchar *re;
#endif /* REGEX_DEBUG */ char *lastpos;
while ((re->mode & RE_ENDMARK) == 0) { char *str;
if (re->mode & RE_BEGIN) { int iter_limit;
if (!firstp) int n_any;
return 0; int firstp;
re++; char *end_p;
Regex *sub_regex;
struct MatchingContext1 *sub_ctx;
struct MatchingContext2 *ctx2;
};
/*
 * Saved state of one regmatch_sub_anytime() invocation, kept between
 * successive calls so the function can resume where it last yielded.
 */
struct MatchingContext2 {
    /* resume bookkeeping */
    int label;			/* resume point stored by YIELD; 0 = start afresh */
    char *lastpos;		/* end of the match handed back to the caller */

    /* what is being matched, and where */
    Regex *regex;		/* alternative currently tried (walks alt_regex) */
    char *str;			/* start of the text for this repetition */
    int firstp;			/* begin-of-text flag, cleared after a non-empty match */
    int n_any;			/* bytes consumed by the latest sub-regex match */

    /* nested iterator states */
    struct MatchingContext1 *ctx;	/* state for one pass over the sub-regex */
    struct MatchingContext2 *ctx2;	/* state for the recursive further repetitions */
};
/*
 * YIELD(retval, context, lnum): hand `retval' back to the caller while
 * remembering where to continue.  It stores `lnum' in (context)->label,
 * returns, and defines the label `label<lnum>' right after the return;
 * the switch at the top of each matcher jumps to that label on the next
 * call.  The expansion is several statements plus a label, so it is only
 * safe as a statement of its own inside braces (never as the unbraced
 * body of an if/else), and each `lnum' may be used once per function.
 */
#define YIELD(retval,context,lnum) (context)->label = lnum; return (retval); label##lnum:
static int regmatch_iter(struct MatchingContext1 *, regexchar *, char *, int);
static int
regmatch_sub_anytime(struct MatchingContext2 *c, Regex *regex,
regexchar * pat2, char *str, int iter_limit, int firstp)
{
switch (c->label) {
case 1:
goto label1;
case 2:
goto label2;
case 3:
goto label3;
}
c->ctx = GC_malloc(sizeof(struct MatchingContext1));
c->ctx2 = GC_malloc(sizeof(struct MatchingContext2));
c->ctx->label = 0;
c->regex = regex;
c->n_any = 0;
c->str = str;
c->firstp = firstp;
for (;;) {
c->ctx->label = 0;
while (regmatch_iter(c->ctx, c->regex->re, c->str, c->firstp)) {
c->n_any = c->ctx->lastpos - c->str;
if (c->n_any <= 0)
continue;
c->firstp = 0;
if (RE_MODE(pat2) == RE_ENDMARK) {
c->lastpos = c->str + c->n_any;
YIELD(1, c, 1);
}
else if (regmatch(pat2, c->str + c->n_any,
c->firstp, &c->lastpos) == 1) {
YIELD(1, c, 2);
}
if (iter_limit == 1)
continue;
c->ctx2->label = 0;
while (regmatch_sub_anytime(c->ctx2, regex, pat2,
c->str + c->n_any, iter_limit - 1,
c->firstp)) {
c->lastpos = c->ctx2->lastpos;
YIELD(1, c, 3);
}
} }
else if (re->mode & RE_ANYTIME) { if (c->regex->alt_regex == NULL)
short matched, ok = 0; break;
for (;;) { c->regex = c->regex->alt_regex;
matched = 0; }
if (regmatch(re + 1, p, ep - p, firstp, &lpos) == 1) { return 0;
llpos = lpos; }
matched = 1;
ok = 1; static int
regmatch_iter(struct MatchingContext1 *c,
regexchar * re, char *str, int firstp)
{
switch (c->label) {
case 1:
goto label1;
case 2:
goto label2;
case 3:
goto label3;
case 4:
goto label4;
case 5:
goto label5;
case 6:
goto label6;
case 7:
goto label7;
}
if (RE_MODE(re) == RE_ENDMARK)
return 0;
c->re = re;
c->end_p = str + strlen(str);
c->firstp = firstp;
c->str = str;
c->sub_ctx = NULL;
while (RE_MODE(c->re) != RE_ENDMARK) {
if (c->re->mode & (RE_ANYTIME | RE_OPT)) {
if (c->re->mode & RE_ANYTIME)
c->iter_limit = RE_ITER_LIMIT;
else
c->iter_limit = 1;
c->n_any = -1;
while (c->n_any < c->iter_limit) {
if (c->str + c->n_any >= c->end_p) {
return 0;
} }
if (p >= ep) if (c->n_any >= 0) {
break; if (RE_MODE(c->re) == RE_SUBREGEX) {
#ifdef JP_CHARSET c->ctx2 = GC_malloc(sizeof(struct MatchingContext2));
if (IS_KANJI1(*p)) { c->ctx2->label = 0;
k = RE_KANJI(p); while (regmatch_sub_anytime(c->ctx2,
if (regmatch1(re, k)) { c->re->p.sub,
if (lastpos != NULL) c->re + 1,
*lastpos = llpos; c->str + c->n_any,
p += 2; c->iter_limit,
c->firstp)) {
c->n_any = c->ctx2->lastpos - c->str;
c->lastpos = c->ctx2->lastpos;
YIELD(1, c, 1);
}
return 0;
}
#ifdef JP_CHARSET
else if (IS_KANJI1(c->str[c->n_any])) {
longchar k;
k = RE_KANJI(c->str + c->n_any);
if (regmatch1(c->re, k)) {
c->n_any += 2;
}
else {
return 0;
}
c->firstp = 0;
}
#endif
else {
longchar k;
k = (unsigned char)c->str[c->n_any];
if (regmatch1(c->re, k)) {
c->n_any++;
}
else {
return 0;
}
c->firstp = 0;
} }
else
break;
} }
else else
#endif c->n_any++;
{ if (RE_MODE(c->re + 1) == RE_ENDMARK) {
k = (unsigned char)*p; c->lastpos = c->str + c->n_any;
if (regmatch1(re, k)) { YIELD(1, c, 2);
p++; }
if (lastpos != NULL) else if (regmatch(c->re + 1, c->str + c->n_any,
*lastpos = llpos; c->firstp, &c->lastpos) == 1) {
} YIELD(1, c, 3);
else
break;
} }
} }
if (lastpos != NULL) return 0;
*lastpos = llpos;
return ok;
} }
else if (re->mode & RE_END) { /* regexp other than pat*, pat+ and pat? */
if (lastpos != NULL) if (c->str >= c->end_p)
*lastpos = p; return 0;
return (p >= ep); switch (RE_MODE(c->re)) {
} case RE_BEGIN:
else { if (!c->firstp)
int a; return 0;
c->re++;
break;
case RE_END:
c->lastpos = c->str;
c->re++;
YIELD((c->str >= c->end_p), c, 4);
break;
case RE_SUBREGEX:
if (c->sub_ctx == NULL) {
c->sub_ctx = GC_malloc(sizeof(struct MatchingContext1));
}
c->sub_regex = c->re->p.sub;
for (;;) {
c->sub_ctx->label = 0;
while (regmatch_iter(c->sub_ctx, c->sub_regex->re,
c->str, c->firstp)) {
if (c->sub_ctx->lastpos != c->str)
c->firstp = 0;
if (RE_MODE(c->re + 1) == RE_ENDMARK) {
c->lastpos = c->sub_ctx->lastpos;
YIELD(1, c, 5);
}
else if (regmatch(c->re + 1, c->sub_ctx->lastpos,
c->firstp, &c->lastpos) == 1) {
YIELD(1, c, 6);
}
}
if (c->sub_regex->alt_regex == NULL)
break;
c->sub_regex = c->sub_regex->alt_regex;
}
return 0;
default:
#ifdef JP_CHARSET #ifdef JP_CHARSET
if (IS_KANJI1(*p)) { if (IS_KANJI1(*c->str)) {
k = RE_KANJI(p); longchar k;
p += 2; k = RE_KANJI(c->str);
a = regmatch1(re, k); c->str += 2;
if (!regmatch1(c->re, k))
return 0;
} }
else else
#endif #endif
{ {
k = (unsigned char)*(p++); longchar k;
a = regmatch1(re, k); k = (unsigned char)*(c->str++);
if (!regmatch1(c->re, k))
return 0;
} }
if (!a) c->re++;
return 0; c->firstp = 0;
else
re++;
} }
} }
if (lastpos != NULL) c->lastpos = c->str;
*lastpos = p; #ifdef REGEX_DEBUG
if (verbose)
printf("Succeed: %s %d\n", c->str, c->lastpos - c->str);
#endif
YIELD(1, c, 7);
return 0;
}
static int
regmatch(regexchar * re, char *str, int firstp, char **lastpos)
{
struct MatchingContext1 contx;
*lastpos = NULL;
contx.label = 0;
while (regmatch_iter(&contx, re, str, firstp)) {
#ifdef REGEX_DEBUG
char *p;
if (verbose) {
printf("regmatch: matched <");
for (p = str; p < contx.lastpos; p++)
putchar(*p);
printf(">\n");
}
#endif
if (*lastpos == NULL || *lastpos < contx.lastpos)
*lastpos = contx.lastpos;
}
if (*lastpos == NULL)
return 0;
return 1; return 1;
} }
static int static int
regmatch1(regexchar * re, longchar c) regmatch1(regexchar * re, longchar c)
{ {
switch (re->mode & RE_MATCHMODE) { switch (RE_MODE(re)) {
case RE_ANY: case RE_ANY:
#ifdef REGEX_DEBUG #ifdef REGEX_DEBUG
printf("%c vs any. -> 1\n", c); if (verbose)
printf("%c vs any. -> 1\n", c);
#endif /* REGEX_DEBUG */ #endif /* REGEX_DEBUG */
return 1; return 1;
case RE_NORMAL: case RE_NORMAL:
#ifdef REGEX_DEBUG #ifdef REGEX_DEBUG
printf("RE=%c vs %c -> %d\n", *re->pattern, c, *re->pattern == c); if (verbose)
printf("RE=%c vs %c -> %d\n", *re->p.pattern, c,
*re->p.pattern == c);
#endif /* REGEX_DEBUG */ #endif /* REGEX_DEBUG */
if (re->mode & RE_IGNCASE) { if (re->mode & RE_IGNCASE) {
if (*re->pattern < 127 && c < 127 && if (*re->p.pattern < 127 && c < 127 &&
IS_ALPHA(*re->pattern) && IS_ALPHA(c)) IS_ALPHA(*re->p.pattern) && IS_ALPHA(c))
return tolower(*re->pattern) == tolower(c); return tolower(*re->p.pattern) == tolower(c);
else else
return *re->pattern == c; return *re->p.pattern == c;
} }
else else
return (*re->pattern == c); return (*re->p.pattern == c);
case RE_WHICH: case RE_WHICH:
return matchWhich(re->pattern, c); return matchWhich(re->p.pattern, c);
case RE_EXCEPT: case RE_EXCEPT:
return !matchWhich(re->pattern, c); return !matchWhich(re->p.pattern, c);
} }
return 0; return 0;
} }
@@ -325,7 +623,8 @@ matchWhich(longchar * pattern, longchar c)
int ans = 0; int ans = 0;
#ifdef REGEX_DEBUG #ifdef REGEX_DEBUG
printf("RE pattern = %s char=%c", pattern, c); if (verbose)
printf("RE pattern = %s char=%s", lc2c(pattern, 10000), lc2c(&c, 1));
#endif /* REGEX_DEBUG */ #endif /* REGEX_DEBUG */
while (*p != '\0') { while (*p != '\0') {
if (*(p + 1) == RE_WHICH_RANGE && *(p + 2) != '\0') { /* Char class. */ if (*(p + 1) == RE_WHICH_RANGE && *(p + 2) != '\0') { /* Char class. */
@@ -344,64 +643,130 @@ matchWhich(longchar * pattern, longchar c)
} }
} }
#ifdef REGEX_DEBUG #ifdef REGEX_DEBUG
printf(" -> %d\n", ans); if (verbose)
printf(" -> %d\n", ans);
#endif /* REGEX_DEBUG */ #endif /* REGEX_DEBUG */
return ans; return ans;
} }
#ifdef REGEX_DEBUG #ifdef REGEX_DEBUG
char * char *
lc2c(longchar * x) lc2c(longchar * x, int len)
{ {
static char y[100]; static char y[100];
int i = 0; int i = 0;
char *r;
while (x[i]) { while (x[i] && i < len) {
if (x[i] == RE_WHICH_RANGE) if (x[i] == RE_WHICH_RANGE)
y[i] = '-'; y[i++] = '-';
else if (x[i] >= 128) {
y[i++] = ((x[i] >> 8) & 0xff);
y[i++] = (x[i] & 0xff);
}
else else
y[i] = x[i]; y[i++] = x[i];
i++;
} }
y[i] = '\0'; y[i] = '\0';
return y; r = GC_malloc_atomic(i + 1);
strcpy(r, y);
return r;
} }
void void
debugre(re, s) debugre(regexchar * re)
regexchar *re;
char *s;
{ {
for (; !(re->mode & RE_ENDMARK); re++) { for (; RE_MODE(re) != RE_ENDMARK; re++) {
if (re->mode & RE_BEGIN) { switch (RE_MODE(re)) {
case RE_BEGIN:
printf("Begin "); printf("Begin ");
continue; continue;
} case RE_END:
else if (re->mode & RE_END) {
printf("End "); printf("End ");
continue; continue;
} }
if (re->mode & RE_ANYTIME) if (re->mode & RE_ANYTIME)
printf("Anytime-"); printf("Anytime-");
if (re->mode & RE_OPT)
printf("Opt-");
switch (re->mode & RE_MATCHMODE) { switch (RE_MODE(re)) {
case RE_ANY: case RE_ANY:
printf("Any "); printf("Any ");
break; break;
case RE_NORMAL: case RE_NORMAL:
printf("Match-to'%c' ", *re->pattern); printf("Match-to'%c' ", *re->p.pattern);
break; break;
case RE_WHICH: case RE_WHICH:
printf("One-of\"%s\" ", lc2c(re->pattern)); printf("One-of\"%s\" ", lc2c(re->p.pattern, 10000));
break; break;
case RE_EXCEPT: case RE_EXCEPT:
printf("Other-than\"%s\" ", lc2c(re->pattern)); printf("Other-than\"%s\" ", lc2c(re->p.pattern, 10000));
break; break;
case RE_SUBREGEX:
{
Regex *r = re->p.sub;
printf("(");
while (r) {
debugre(r->re);
if (r->alt_regex)
printf(" | ");
r = r->alt_regex;
}
printf(")");
break;
}
default: default:
printf("Unknown "); printf("Unknown ");
} }
} }
putchar('\n');
} }
#endif /* REGEX_DEBUG */ #endif /* REGEX_DEBUG */
#ifdef REGEXTEST
/*
 * Test driver, built only when REGEXTEST is defined.
 *
 * Reads whitespace-separated "pattern text" pairs from the file named
 * on the command line (stdin when no file is given).  For every pair it
 * compiles the pattern, matches it against the text and prints
 *     /pattern/ <TAB> text <TAB> matched-substring
 * or "no_match" in the last column.  With REGEX_DEBUG, leading "-v"
 * arguments switch on verbose tracing.
 *
 * Exits with status 1 when the input file cannot be opened or a pattern
 * fails to compile; returns 0 once the input is exhausted.
 */
int
main(int argc, char **argv)
{
    /* The "%127s" field widths below must stay at sizeof(buf) - 1. */
    char buf[128], buf2[128];
    char *msg;
    Regex *re;
    char *fpos, *epos;
    FILE *f = stdin;
    int i = 1;

#ifdef REGEX_DEBUG
    /* consume leading -v flags; i ends up at the first non-flag argument */
    for (i = 1; i < argc; i++) {
	if (strcmp(argv[i], "-v") == 0)
	    verbose = 1;
	else
	    break;
    }
#endif
    if (argc > i)
	f = fopen(argv[i], "r");
    if (f == NULL) {
	/* only reachable when fopen() failed, so argv[i] is valid here */
	fprintf(stderr, "Can't open %s\n", argv[i]);
	exit(1);
    }
    /*
     * Bounded reads: an unbounded "%s" would overflow the 128-byte
     * buffers on a long token.  A token longer than 127 bytes is now
     * split across fields instead of corrupting memory.
     */
    while (fscanf(f, "%127s%127s", buf, buf2) == 2) {
	re = newRegex(buf, 0, NULL, &msg);
	if (re == NULL) {
	    printf("Error on regexp /%s/: %s\n", buf, msg);
	    exit(1);
	}
	/*
	 * RegexMatch() returns 1 on a match, 0 on no match and -1 on
	 * error; only a real match has a position worth printing.
	 */
	if (RegexMatch(re, buf2, 0, 1) == 1) {
	    printf("/%s/\t%s\t", buf, buf2);
	    MatchedPosition(re, &fpos, &epos);
	    while (fpos < epos)
		putchar(*(fpos++));
	}
	else
	    printf("/%s/\t%s\tno_match", buf, buf2);
	putchar('\n');
    }
    return 0;
}
#endif

35
regex.h
View File

@@ -1,44 +1,25 @@
/* $Id: regex.h,v 1.3 2001/11/24 02:01:26 ukai Exp $ */ /* $Id: regex.h,v 1.4 2002/01/10 04:55:07 ukai Exp $ */
#define REGEX_MAX 64 #define REGEX_MAX 64
#define STORAGE_MAX 256 #define STORAGE_MAX 256
#ifndef NULL
#define NULL 0
#endif /* not NULL */
#define RE_NORMAL 0
#define RE_MATCHMODE 0x07
#define RE_ANY 0x01
#define RE_WHICH 0x02
#define RE_EXCEPT 0x04
#define RE_ANYTIME 0x08
#define RE_BEGIN 0x10
#define RE_END 0x20
#define RE_IGNCASE 0x40
#define RE_ENDMARK 0x80
typedef unsigned short longchar; typedef unsigned short longchar;
typedef struct regexchar {
typedef struct { union {
longchar *pattern;
longchar *pattern; struct regex *sub;
} p;
unsigned char mode; unsigned char mode;
} regexchar; } regexchar;
typedef struct { typedef struct regex {
regexchar re[REGEX_MAX]; regexchar re[REGEX_MAX];
longchar storage[STORAGE_MAX]; longchar storage[STORAGE_MAX];
char *position; char *position;
char *lposition; char *lposition;
struct regex *alt_regex;
} Regex; } Regex;