-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse-gumbo.c
More file actions
75 lines (65 loc) · 1.86 KB
/
parse-gumbo.c
File metadata and controls
75 lines (65 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/* SPDX-License-Identifier: MIT */
/* SPDX-FileCopyrightText: (c) Copyright 2024 Andrew Bower <andrew@bower.uk> */
/* parse tag soup AKA HTML5
*
* Uses libgumbo to parse the HTML, walks the resultant tree and renders
* only text content. The output is in UTF-8.
*/
#include <uchar.h>
#include <gumbo.h>
#include "config.h"
#include "render.h"
#include "parse-gumbo.h"
/* Render the subtree rooted at node, depth first.
 *
 * Nodes carrying text (text and whitespace always; comments when
 * opt.comment is set; CDATA unless it is being treated as a comment while
 * comments are disabled) are passed to render_text().
 *
 * Document and element nodes have their children walked recursively. The
 * walk is bracketed by two render_element() calls, the first with its
 * second argument false and the last with it true (presumably marking
 * the start and end of the element - confirm against render.h). Children
 * are not visited when the element's rendering entry has skip set.
 *
 * All other node types are ignored, together with anything beneath them.
 */
static void walk_tree(GumboNode *node) {
  /* By default, neither render content nor descend tree further */
  GumboVector *children = nullptr;
  GumboText *text = nullptr;
  const char8_t *tag = nullptr;
  const struct render_elem *rendering = nullptr;

  switch (node->type) {
  case GUMBO_NODE_CDATA:
    /* Suppressed only when CDATA counts as a comment and comments are off */
    if (!opt.cdata_is_comment || opt.comment) {
      text = &node->v.text;
    }
    break;
  case GUMBO_NODE_COMMENT:
    if (opt.comment) {
      text = &node->v.text;
    }
    break;
  case GUMBO_NODE_TEXT:
  case GUMBO_NODE_WHITESPACE:
    text = &node->v.text;
    break;
  case GUMBO_NODE_DOCUMENT:
    /* The document has children but no tag: tag and rendering stay null */
    children = &node->v.document.children;
    break;
  case GUMBO_NODE_ELEMENT:
    children = &node->v.element.children;
    /* Only tags below GUMBO_TAG_UNKNOWN are looked up by name; for the
     * rest, tag and rendering stay null but the children are still walked.
     */
    if (node->v.element.tag < GUMBO_TAG_UNKNOWN) {
      tag = (const char8_t *) gumbo_normalized_tagname(node->v.element.tag);
      rendering = get_rendering(tag);
    }
    break;
  default:
    /* Do nothing */
    break;
  }

  if (text) {
    render_text((char8_t *) text->text);
  }

  if (children) {
    render_element(tag, false, rendering);
    if (!rendering || !rendering->skip) {
      /* GumboVector.length is unsigned, so the index is unsigned too */
      for (unsigned int child = 0; child < children->length; child++) {
        walk_tree(children->data[child]);
      }
    }
    render_element(tag, true, rendering);
  }
}
/* Parse the HTML held in input with libgumbo and render its text content.
 *
 * input->data is handed to gumbo_parse(), which takes no length argument.
 * NOTE(review): that relies on the buffer being NUL-terminated - confirm
 * that whatever fills struct mapped_buffer guarantees this, or switch to
 * gumbo_parse_with_options() with an explicit length.
 *
 * Returns 0 once the tree has been walked and the parser output released,
 * or -1 if gumbo_parse() produced no output (a diagnostic is also written
 * to stderr in that case).
 */
int parse_tagsoup(struct mapped_buffer *input) {
  GumboOutput *doc = gumbo_parse(input->data);

  if (!doc) {
    fprintf(stderr, "html parsing failed\n");
    return -1;
  }

  walk_tree(doc->root);
  gumbo_destroy_output(&kGumboDefaultOptions, doc);
  return 0;
}