htmlparser.js
4.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/**
*
* htmlParser改造自: https://github.com/blowsie/Pure-JavaScript-HTML5-Parser
*
* author: Di (微信小程序开发工程师)
* organization: WeAppDev(微信小程序开发论坛)(http://weappdev.com)
* 垂直微信小程序开发交流社区
*
* github地址: https://github.com/icindy/wxParse
*
* for: 微信小程序富文本解析
* detail : http://weappdev.com/t/wxparse-alpha0-1-html-markdown/184
*/
// Regular Expressions for parsing tags and attributes
const startTag = /^<([-A-Za-z0-9_]+)((?:\s+[a-zA-Z0-9_:][-a-zA-Z0-9_:.]*(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/;
const endTag = /^<\/([-A-Za-z0-9_]+)[^>]*>/;
const attr = /([a-zA-Z0-9_:][-a-zA-Z0-9_:.]*)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;
function makeMap(str) {
const obj = {};
const items = str.split(',');
for (let i = 0; i < items.length; i += 1) obj[items[i]] = true;
return obj;
}
// Empty Elements - HTML 5
const empty = makeMap('area,base,basefont,br,col,frame,hr,img,input,link,meta,param,embed,command,keygen,source,track,wbr');
// Block Elements - HTML 5
const block = makeMap('address,code,article,applet,aside,audio,blockquote,button,canvas,center,dd,del,dir,div,dl,dt,fieldset,figcaption,figure,footer,form,frameset,h1,h2,h3,h4,h5,h6,header,hgroup,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,output,p,pre,section,script,table,tbody,td,tfoot,th,thead,tr,ul,video');
// Inline Elements - HTML 5
const inline = makeMap('a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var');
// Elements that you can, intentionally, leave open
// (and which close themselves)
const closeSelf = makeMap('colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr');
// Attributes that have their values filled in disabled="disabled"
const fillAttrs = makeMap('checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected');
function HTMLParser(html, handler) {
let index;
let chars;
let match;
let last = html;
const stack = [];
stack.last = () => stack[stack.length - 1];
function parseEndTag(tag, tagName) {
// If no tag name is provided, clean shop
let pos;
if (!tagName) {
pos = 0;
} else {
// Find the closest opened tag of the same type
tagName = tagName.toLowerCase();
for (pos = stack.length - 1; pos >= 0; pos -= 1) {
if (stack[pos] === tagName) break;
}
}
if (pos >= 0) {
// Close all the open elements, up the stack
for (let i = stack.length - 1; i >= pos; i -= 1) {
if (handler.end) handler.end(stack[i]);
}
// Remove the open elements from the stack
stack.length = pos;
}
}
function parseStartTag(tag, tagName, rest, unary) {
tagName = tagName.toLowerCase();
if (block[tagName]) {
while (stack.last() && inline[stack.last()]) {
parseEndTag('', stack.last());
}
}
if (closeSelf[tagName] && stack.last() === tagName) {
parseEndTag('', tagName);
}
unary = empty[tagName] || !!unary;
if (!unary) stack.push(tagName);
if (handler.start) {
const attrs = [];
rest.replace(attr, function genAttr(matches, name) {
const value = arguments[2] || arguments[3] || arguments[4] || (fillAttrs[name] ? name : '');
attrs.push({
name,
value,
escaped: value.replace(/(^|[^\\])"/g, '$1\\"'), // "
});
});
if (handler.start) {
handler.start(tagName, attrs, unary);
}
}
}
while (html) {
chars = true;
if (html.indexOf('</') === 0) {
match = html.match(endTag);
if (match) {
html = html.substring(match[0].length);
match[0].replace(endTag, parseEndTag);
chars = false;
}
// start tag
} else if (html.indexOf('<') === 0) {
match = html.match(startTag);
if (match) {
html = html.substring(match[0].length);
match[0].replace(startTag, parseStartTag);
chars = false;
}
}
if (chars) {
index = html.indexOf('<');
let text = '';
while (index === 0) {
text += '<';
html = html.substring(1);
index = html.indexOf('<');
}
text += index < 0 ? html : html.substring(0, index);
html = index < 0 ? '' : html.substring(index);
if (handler.chars) handler.chars(text);
}
if (html === last) throw new Error(`Parse Error: ${html}`);
last = html;
}
// Clean up any remaining tags
parseEndTag();
}
export default HTMLParser;