Proper raw html handling

This commit is contained in:
Nate Choe
2022-05-11 12:17:37 -05:00
parent cda87f2ac0
commit 9d0d998b71
3 changed files with 33 additions and 69 deletions

View File

@@ -47,7 +47,7 @@ struct linedata {
enum linetype type;
union {
int intensity;
int isfirst;
int islast;
} data;
};

View File

@@ -40,8 +40,6 @@ static int parseline(char *line, struct parsestate *currstate, FILE *out);
static int endpara(struct parsestate *state, FILE *out);
static void handlehtmlcase(struct linedata *data, struct parsestate *state,
char *line, FILE *out);
static void handlehtmlmiddle(struct linedata *data, struct parsestate *state,
char *line, FILE *out);
int parsemarkdown(FILE *infile, FILE *outfile) {
struct linefile *realin;
@@ -88,25 +86,25 @@ static int parseline(char *line, struct parsestate *currstate, FILE *out) {
currstate->isfirst = 0;
return 0;
case HTMLCONCRETE:
handlehtmlmiddle(&type, currstate, line, out);
handlehtmlcase(&type, currstate, line, out);
return 0;
case COMMENTLONG:
handlehtmlmiddle(&type, currstate, line, out);
handlehtmlcase(&type, currstate, line, out);
return 0;
case PHP:
handlehtmlmiddle(&type, currstate, line, out);
handlehtmlcase(&type, currstate, line, out);
return 0;
case COMMENTSHORT:
handlehtmlmiddle(&type, currstate, line, out);
handlehtmlcase(&type, currstate, line, out);
return 0;
case CDATA:
handlehtmlmiddle(&type, currstate, line, out);
handlehtmlcase(&type, currstate, line, out);
return 0;
case SKELETON:
handlehtmlmiddle(&type, currstate, line, out);
handlehtmlcase(&type, currstate, line, out);
return 0;
case GENERICTAG:
handlehtmlmiddle(&type, currstate, line, out);
handlehtmlcase(&type, currstate, line, out);
return 0;
case EMPTY: case PLAIN: case SPACECODE: case HR:
case SETEXT1: case SETEXT2: case HEADER:
@@ -157,7 +155,7 @@ static int parseline(char *line, struct parsestate *currstate, FILE *out) {
* Should NOT compile to this:
<p>Chapter 1</p><hr>
<p>Chapter 1</p><hr />
* but rather to this
@@ -249,14 +247,8 @@ static void handlehtmlcase(struct linedata *data, struct parsestate *state,
fputs(line, out);
fputc('\n', out);
state->prev.type = data->type;
}
static void handlehtmlmiddle(struct linedata *data, struct parsestate *state,
char *line, FILE *out) {
if (state->prev.type == data->type && !data->data.isfirst) {
if (state->prev.type == data->type && data->data.islast) {
state->prev.type = EMPTY;
return;
}
fputs(line, out);
fputc('\n', out);
}

View File

@@ -25,7 +25,7 @@
static char *truncate(char *str);
static char *after(char *begin, char *str);
static void identifyend(char *line, enum linetype prev, struct linedata *ret);
static int isend(char *line, enum linetype prev);
static char *concretetags[] = { "pre", "script", "style", "textarea" };
static char *skeletontags[] = {
@@ -42,7 +42,8 @@ static char *skeletontags[] = {
void identifyline(char *line, struct linedata *prev, struct linedata *ret) {
int i;
if (HTMLSTART <= prev->type && prev->type <= HTMLEND) {
identifyend(truncate(line), prev->type, ret);
ret->type = prev->type;
ret->data.islast = isend(truncate(line), prev->type);
return;
}
if (prev->type != PLAIN) {
@@ -117,7 +118,7 @@ notheader:
#define HTMLSTARTCASE(start, rettype) \
if (after(start, line) != NULL) { \
ret->type = rettype; \
ret->data.isfirst = 1; \
ret->data.islast = isend(line, rettype); \
return; \
}
HTMLSTARTCASE("<!--", COMMENTLONG);
@@ -128,6 +129,8 @@ notheader:
if (line[0] == '<') {
char *testline;
testline = line + 1;
if (testline[0] == '/')
++testline;
for (i = 0; i < LEN(concretetags); ++i) {
char *aftertag;
aftertag = after(concretetags[i], testline);
@@ -135,35 +138,33 @@ notheader:
continue;
if (aftertag[0] == '\0' || strchr(" >", aftertag[0])) {
ret->type = HTMLCONCRETE;
ret->data.isfirst = 1;
ret->data.islast = 0;
return;
}
}
if (testline[0] == '/')
++testline;
for (i = 0; i < LEN(skeletontags); ++i) {
char *aftertag;
aftertag = after(skeletontags[i], testline);
if (aftertag == NULL)
continue;
if (aftertag[0] == '\0' ||
strchr(" >", aftertag[0]) ||
strchr(" \t>", aftertag[0]) ||
after("/>", aftertag) != NULL) {
ret->type = SKELETON;
ret->data.isfirst = 1;
ret->data.islast = 0;
return;
}
}
if (isgenerictag(line)) {
ret->type = GENERICTAG;
ret->data.isfirst = 1;
ret->data.islast = 0;
return;
}
}
ret->type = PLAIN;
ret->data.isfirst = 1;
ret->data.islast = 0;
return;
}
@@ -275,61 +276,32 @@ static char *after(char *begin, char *str) {
return str;
}
/*
 * Reports whether `line` terminates the raw HTML block whose kind is `prev`.
 *
 * line: the text to inspect (callers in this file pass either the raw line
 *       or the result of truncate(line)).
 * prev: the kind of HTML block that is currently open.
 *
 * Returns nonzero if the block ends on this line, zero if it continues.
 */
static int isend(char *line, enum linetype prev) {
	int i;

	switch (prev) {
	case EMPTY: case PLAIN: case SPACECODE: case FENCECODE: case HR:
	case SETEXT1: case SETEXT2: case HEADER:
		/* In this case, something has gone terribly wrong: these are
		 * not HTML block kinds. Report "ended" so the caller does not
		 * stay in a raw HTML state. */
		return 1;
	case HTMLCONCRETE:
		/* The block ends on a line containing the closing form of ANY
		 * of the concrete tags, so every tag must be tried before
		 * giving up; returning from inside the loop unconditionally
		 * would only ever test the first entry. */
		for (i = 0; i < LEN(concretetags); ++i) {
			/* Longest entry is "textarea": "</textarea>" plus the
			 * terminator needs 12 bytes, well within 30. */
			char endtag[30];
			sprintf(endtag, "</%s>", concretetags[i]);
			if (strstr(line, endtag) != NULL)
				return 1;
		}
		return 0;
	case COMMENTLONG:
		return strstr(line, "-->") != NULL;
	case PHP:
		return strstr(line, "?>") != NULL;
	case COMMENTSHORT:
		return strchr(line, '>') != NULL;
	case CDATA:
		return strstr(line, "]]>") != NULL;
	case SKELETON: case GENERICTAG:
		/* These kinds end on an empty line.
		 * NOTE(review): presumably truncate() strips surrounding
		 * whitespace so blank lines arrive here as "" -- confirm. */
		return line[0] == '\0';
	}
	/* Any line type not listed above is not an HTML block. */
	return 1;
}