[w3m-dev 03509] HTML parser

* file.c (close_textarea): delete
	(HTMLtagproc1): rewrite
			delete HTML_EOL
			move HTML_LISTING, HTML_N_LISTING
			add HTML_PRE_PLAIN, HTML_N_PRE_PLAIN
			add HTML_PLAINTEXT
			end_tag
	(HTMLlineproc0): s/str/line/
			rewrite
	(completeHTMLstream): </textarea> if necessary
* fm.h (struct readbuffer): delete ignore_tag
			add end_tag
	(RB_XMPMODE): deleted
	(RB_LSTMODE): deleted
	(RB_SCRIPT): added
	(RB_STYLE): added
	(RB_*): renumber
	(R_ST_EOL): added
	(R_ST_*): renumber
	(ST_IS_TAG): check R_ST_EOL
* form.c (form_fputs_decode): remove <eol> handling
* frame.c (newFrame): remove_space()
	(CASE_TABLE_TAG): added
	(createFrameFile): rewrite
* html.c (TagMAP): delete eol
		add pre_plain, /pre_plain
* html.h (HTML_EOL): deleted
	(HTML_PRE_PLAIN): added
	(HTML_N_PRE_PLAIN): added
* table.c (visible_length): rewrite
	(visible_length_plain): added
	(maximum_visible_length_plain): added
	(do_refill): R_ST_EOL
	(table_close_select): end_tag
	(table_close_textarea): end_tag
	(TAG_ACTION_PLAIN): added
	(feed_table_tag): rewrite
	(feed_table): rewrite
* table.h (TBLM_*) reassign
	(struct table_mode): delete ignore_tag
			add end_tag
* tagtable.tab (eol): deleted
		(pre_plain): added
		(/pre_plain): added
From: Hironori SAKAMOTO <hsaka@mth.biglobe.ne.jp>
This commit is contained in:
Fumitoshi UKAI
2002-12-03 15:35:09 +00:00
parent 49098b36fe
commit 569d535b29
10 changed files with 565 additions and 379 deletions

160
frame.c
View File

@@ -1,4 +1,4 @@
/* $Id: frame.c,v 1.22 2002/11/28 16:00:34 ukai Exp $ */
/* $Id: frame.c,v 1.23 2002/12/03 15:35:10 ukai Exp $ */
#include "fm.h"
#include "parsetagx.h"
#include "myctype.h"
@@ -97,7 +97,7 @@ newFrame(struct parsed_tag *tag, Buffer *buf)
body->baseURL = baseURL(buf);
if (tag) {
if (parsedtag_get_value(tag, ATTR_SRC, &p))
body->url = url_quote_conv(p, buf->document_code);
body->url = url_quote_conv(remove_space(p), buf->document_code);
if (parsedtag_get_value(tag, ATTR_NAME, &p) && *p != '_')
body->name = url_quote_conv(p, buf->document_code);
}
@@ -412,6 +412,23 @@ frame_download_source(struct frame_body *b, ParsedURL *currentURL,
return ret_frameset;
}
#define CASE_TABLE_TAG \
case HTML_TR:\
case HTML_N_TR:\
case HTML_TD:\
case HTML_N_TD:\
case HTML_TH:\
case HTML_N_TH:\
case HTML_THEAD:\
case HTML_N_THEAD:\
case HTML_TBODY:\
case HTML_N_TBODY:\
case HTML_TFOOT:\
case HTML_N_TFOOT:\
case HTML_COLGROUP:\
case HTML_N_COLGROUP:\
case HTML_COL
static int
createFrameFile(struct frameset *f, FILE * f1, Buffer *current, int level,
int force_reload)
@@ -467,8 +484,10 @@ createFrameFile(struct frameset *f, FILE * f1, Buffer *current, int level,
struct frameset *f_frameset;
int i = c + r * f->col;
char *p = "";
int status = R_ST_NORMAL;
Str tok = Strnew();
int status;
int pre_mode = 0;
int end_tag = 0;
frame = f->frame[i];
@@ -557,12 +576,13 @@ createFrameFile(struct frameset *f, FILE * f1, Buffer *current, int level,
break;
}
do {
status = R_ST_NORMAL;
int is_tag = FALSE;
char *q;
struct parsed_tag *tag;
do {
if (*p == '\0') {
Str tmp = StrmyUFgets(&f2);
if (tmp->length == 0 && status != R_ST_NORMAL)
tmp = correct_irrtag(status);
if (tmp->length == 0)
break;
#ifdef JP_CHARSET
@@ -573,21 +593,67 @@ createFrameFile(struct frameset *f, FILE * f1, Buffer *current, int level,
cleanup_line(tmp, HTML_MODE);
p = tmp->ptr;
}
if (status == R_ST_NORMAL)
read_token(tok, &p, &status, 1, 0);
else if (ST_IS_COMMENT(status))
read_token(tok, &p, &status, 0, 0);
else
read_token(tok, &p, &status, 1, 1);
read_token(tok, &p, &status, 1, status != R_ST_NORMAL);
} while (status != R_ST_NORMAL);
if (tok->length == 0)
continue;
if (tok->ptr[0] == '<') {
is_tag = TRUE;
if (pre_mode & (RB_PLAIN | RB_INTXTA | RB_SCRIPT |
RB_STYLE)) {
q = tok->ptr;
if ((tag = parse_tag(&q, FALSE)) &&
tag->tagid == end_tag) {
if (pre_mode & RB_PLAIN) {
fputs("</PRE_PLAIN>", f1);
pre_mode = 0;
end_tag = 0;
goto token_end;
}
pre_mode = 0;
end_tag = 0;
goto proc_normal;
}
if (strncmp(tok->ptr, "<!--", 4) &&
(q = strchr(tok->ptr + 1, '<'))) {
tok = Strnew_charp_n(tok->ptr, q - tok->ptr);
p = Strnew_m_charp(q, p, NULL)->ptr;
status = R_ST_NORMAL;
}
is_tag = FALSE;
}
else if (pre_mode & RB_INSELECT) {
q = tok->ptr;
if ((tag = parse_tag(&q, FALSE))) {
if ((tag->tagid == end_tag) ||
(tag->tagid == HTML_N_FORM)) {
if (tag->tagid == HTML_N_FORM)
fputs("</SELECT>", f1);
pre_mode = 0;
end_tag = 0;
goto proc_normal;
}
if (t_stack) {
switch (tag->tagid) {
case HTML_TABLE:
case HTML_N_TABLE:
CASE_TABLE_TAG:
fputs("</SELECT>", f1);
pre_mode = 0;
end_tag = 0;
goto proc_normal;
}
}
}
}
}
proc_normal:
if (is_tag) {
char *q = tok->ptr;
int j, a_target = 0;
struct parsed_tag *tag;
ParsedURL url;
if (!(tag = parse_tag(&q, FALSE)))
@@ -603,7 +669,7 @@ createFrameFile(struct frameset *f, FILE * f1, Buffer *current, int level,
case HTML_BASE:
/* "BASE" is prohibit tag */
if (parsedtag_get_value(tag, ATTR_HREF, &q)) {
q = url_quote_conv(q, code);
q = url_quote_conv(remove_space(q), code);
parseURL(q, &base, NULL);
}
if (parsedtag_get_value(tag, ATTR_TARGET, &q)) {
@@ -660,18 +726,7 @@ createFrameFile(struct frameset *f, FILE * f1, Buffer *current, int level,
goto token_end;
}
break;
case HTML_THEAD:
case HTML_N_THEAD:
case HTML_TBODY:
case HTML_N_TBODY:
case HTML_TFOOT:
case HTML_N_TFOOT:
case HTML_TD:
case HTML_N_TD:
case HTML_TR:
case HTML_N_TR:
case HTML_TH:
case HTML_N_TH:
CASE_TABLE_TAG:
/* table_tags MUST be in table stack */
if (!t_stack) {
Strshrinkfirst(tok, 1);
@@ -682,6 +737,37 @@ createFrameFile(struct frameset *f, FILE * f1, Buffer *current, int level,
}
break;
case HTML_SELECT:
pre_mode = RB_INSELECT;
end_tag = HTML_N_SELECT;
break;
case HTML_TEXTAREA:
pre_mode = RB_INTXTA;
end_tag = HTML_N_TEXTAREA;
break;
case HTML_SCRIPT:
pre_mode = RB_SCRIPT;
end_tag = HTML_N_SCRIPT;
break;
case HTML_STYLE:
pre_mode = RB_STYLE;
end_tag = HTML_N_STYLE;
break;
case HTML_LISTING:
pre_mode = RB_PLAIN;
end_tag = HTML_N_LISTING;
fputs("<PRE_PLAIN>", f1);
goto token_end;
case HTML_XMP:
pre_mode = RB_PLAIN;
end_tag = HTML_N_XMP;
fputs("<PRE_PLAIN>", f1);
goto token_end;
case HTML_PLAINTEXT:
pre_mode = RB_PLAIN;
end_tag = MAX_HTMLTAG;
fputs("<PRE_PLAIN>", f1);
goto token_end;
default:
break;
}
@@ -693,7 +779,8 @@ createFrameFile(struct frameset *f, FILE * f1, Buffer *current, int level,
if (!tag->value[j])
break;
tag->value[j] =
url_quote_conv(tag->value[j], code);
url_quote_conv(remove_space(tag->value[j]),
code);
parseURL2(tag->value[j], &url, &base);
if (url.scheme == SCM_UNKNOWN ||
#ifndef USE_W3MMAILER
@@ -748,11 +835,28 @@ createFrameFile(struct frameset *f, FILE * f1, Buffer *current, int level,
Strfputs(tok, f1);
}
else {
Strfputs(tok, f1);
if (pre_mode & (RB_PLAIN | RB_INTXTA))
fprintf(f1, "%s", html_quote(tok->ptr));
else
Strfputs(tok, f1);
}
token_end:
Strclear(tok);
} while (*p != '\0' || !iseos(f2.stream));
if (pre_mode & RB_PLAIN)
fputs("</PRE_PLAIN>\n", f1);
else if (pre_mode & RB_INTXTA)
fputs("</TEXTAREA></FORM>\n", f1);
else if (pre_mode & RB_INSELECT)
fputs("</SELECT></FORM>\n", f1);
else if (pre_mode & (RB_SCRIPT | RB_STYLE)) {
if (status != R_ST_NORMAL)
fputs(correct_irrtag(status)->ptr, f1);
if (pre_mode & RB_SCRIPT)
fputs("</SCRIPT>\n", f1);
else if (pre_mode & RB_STYLE)
fputs("</STYLE>\n", f1);
}
while (t_stack--)
fputs("</TABLE>\n", f1);
UFclose(&f2);