-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHtmlPreprocesser.cpp
159 lines (145 loc) · 5.08 KB
/
HtmlPreprocesser.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#include <cctype>
#include "HtmlPreprocesser.h"
using namespace std;
// Extract the text of the first <title ...> ... </title ...> element of
// contentStr into title.
//   contentStr - HTML text to scan.
//   title      - output; cleared first, and left empty when no opening
//                "<title" is found.
// Tag names are compared case-insensitively and by prefix only. When the
// closing tag is missing, everything after the opening tag's '>' is copied.
// title must not be the same object as contentStr: it is cleared before
// contentStr is read.
void HtmlPreprocesser::getTitle(const string &contentStr, string &title)
{
    const string openTag = "<title";
    const string closeTag = "</title";
    const size_t len = contentStr.size();

    title.clear();

    // Step 1: locate the first opening tag.
    size_t pos = 0;
    while(pos < len && !imatchHere(contentStr, pos, openTag))
        ++pos;
    if(pos >= len)
        return;

    // Step 2: skip the tag name and any attributes up to the closing '>'.
    pos += openTag.size();
    while(pos < len && contentStr[pos] != '>')
        ++pos;

    // Step 3: copy characters after '>' until the closing tag or end of input.
    for(++pos; pos < len && !imatchHere(contentStr, pos, closeTag); ++pos)
        title += contentStr[pos];
}
// Case-insensitive test for whether pat occurs in str starting at index i.
//   str - text being searched.
//   i   - start index into str.
//   pat - pattern expected at that position.
// Returns true only when all of pat fits inside str at index i and every
// character matches ignoring case. An empty pat matches at any i in
// [0, str.size()]. A negative i, or one past str.size(), yields false.
bool HtmlPreprocesser::imatchHere(const string &str, int i, const string &pat)
{
    if(i < 0)
        return false;
    const size_t start = static_cast<size_t>(i);
    // Reject positions where pat cannot fit; written to avoid start + size overflow.
    if(start > str.size() || pat.size() > str.size() - start)
        return false;
    for(size_t j = 0; j < pat.size(); ++j) {
        // tolower() is undefined for negative arguments other than EOF, so
        // bytes >= 0x80 must go through unsigned char when char is signed.
        const unsigned char a = static_cast<unsigned char>(str[start + j]);
        const unsigned char b = static_cast<unsigned char>(pat[j]);
        if(tolower(a) != tolower(b))
            return false;
    }
    return true;
}
// Copy str into ret with every region from "<tag ...>" through
// "</tag ...>" removed, including both tags.
//   str - input text.
//   tag - tag name without angle brackets; matched case-insensitively.
//   ret - output; cleared first. Must not be the same object as str.
// Returns true when every region that was opened was also closed; false
// when input ends inside an unclosed region (the rest is then dropped).
// Matching is by prefix: "<" + tag also matches longer tag names that
// start with tag. Nesting is not tracked: the first closing tag ends the
// region.
bool HtmlPreprocesser::delBetweenHtmlTags(const string & str, const string tag, string &ret)
{
    const string openTag = "<" + tag;
    const string closeTag = "</" + tag;
    const size_t len = str.size();

    ret.clear();
    ret.reserve(len);

    bool inside = false;
    size_t pos = 0;
    while(pos < len) {
        if(!inside && imatchHere(str, pos, openTag)) {
            inside = true;
            // Skip tag name and attributes; stop on '>' (or end of input).
            pos += openTag.size();
            while(pos < len && str[pos] != '>')
                ++pos;
        }
        if(inside) {
            if(imatchHere(str, pos, closeTag)) {
                inside = false;
                // Skip the closing tag; its '>' is consumed by the ++pos below.
                pos += closeTag.size();
                while(pos < len && str[pos] != '>')
                    ++pos;
            }
        }
        else {
            ret += str[pos];
        }
        ++pos;
    }
    return !inside;
}
// Copy str into ret with every "<!-- ... -->" comment removed.
//   str - input text.
//   ret - output; cleared first. Must not be the same object as str.
// Returns true when every comment was terminated. Returns false when the
// input ends inside a comment; ret then holds only the text that preceded
// that comment.
// The search for "-->" begins at the start of "<!--", so "<!-->" counts
// as a complete comment.
bool HtmlPreprocesser::delComments(const string &str, string &ret)
{
    const string opener = "<!--";
    const string closer = "-->";

    ret.clear();
    ret.reserve(str.size());

    size_t pos = 0;
    while(pos < str.size()) {
        const size_t commentAt = str.find(opener, pos);
        if(commentAt == string::npos) {
            // No further comments: keep the remaining text.
            ret.append(str, pos, string::npos);
            return true;
        }
        // Keep the text in front of the comment.
        ret.append(str, pos, commentAt - pos);

        const size_t closeAt = str.find(closer, commentAt);
        if(closeAt == string::npos)
            return false;   // unterminated comment
        pos = closeAt + closer.size();
    }
    return true;
}
// Copy str into ret with everything between '<' and '>' removed, including
// the brackets themselves.
//   str - input text.
//   ret - output; cleared first. Must not be the same object as str.
// A '<' starts a tag only when another character follows it and that
// character is not whitespace; otherwise the '<' is kept as text.
// '<' and '>' must appear in pairs and cannot be nested.
// Returns true when the last opened tag was closed; false when the input
// ends inside a tag.
bool HtmlPreprocesser::delTags(const string &str, string &ret)
{
    ret.clear();
    ret.reserve(str.size());
    bool inTag = false;
    for(size_t i = 0; i < str.size(); ++i) {
        const char ch = str[i];
        if(inTag) {
            if(ch == '>')
                inTag = false;
            continue;   // tag content and the closing '>' are dropped
        }
        // isspace() is undefined for negative arguments other than EOF, so
        // the byte is converted through unsigned char first.
        const bool opensTag = ch == '<' && i + 1 < str.size()
            && !isspace(static_cast<unsigned char>(str[i+1]));
        if(opensTag)
            inTag = true;
        else
            ret += ch;
    }
    return !inTag;
}
// Copy str into ret with every occurrence of src replaced by dest.
//   str  - input text.
//   src  - substring to look for.
//   dest - replacement text.
//   ret  - output; receives the result.
// Occurrences are found left to right and do not overlap; inserted dest
// text is not rescanned. An empty src has nothing to replace, so ret
// receives an unchanged copy of str. For a non-empty src, ret must not be
// the same object as str, because ret is cleared before str is read.
void HtmlPreprocesser::replaceStr(const string &str, const string src, const string dest, string &ret)
{
    if(src.empty()) {
        // An empty pattern would never advance the scan; pass the text through.
        ret = str;
        return;
    }
    ret.clear();
    ret.reserve(str.size());
    size_t pos = 0;
    for(;;) {
        const size_t hit = str.find(src, pos);
        if(hit == string::npos)
            break;
        ret.append(str, pos, hit - pos);   // text in front of the match
        ret += dest;
        pos = hit + src.size();
    }
    ret.append(str, pos, string::npos);    // tail after the last match
}
// Copy str into ret, keeping ASCII letters (A-Z, a-z) and substituting the
// string dest for every other character.
//   str  - input text.
//   dest - text written in place of each non-letter character.
//   ret  - output; cleared first. Must not be the same object as str.
void HtmlPreprocesser::delOddchar(const string &str, const string dest, string &ret)
{
    const size_t len = str.size();

    ret.clear();
    ret.reserve(len);

    size_t pos = 0;
    while(pos < len) {
        const char ch = str[pos++];
        const bool isUpper = ch >= 'A' && ch <= 'Z';
        const bool isLower = ch >= 'a' && ch <= 'z';
        if(isUpper || isLower)
            ret += ch;
        else
            ret += dest;
    }
}
// Copy str into ret, collapsing every run of consecutive whitespace
// characters into its first character.
//   str - input text.
//   ret - output; cleared first. Must not be the same object as str.
// Whitespace is whatever isspace() reports. The surviving character of a
// run is kept as-is, not converted to ' '.
void HtmlPreprocesser::mergeSpaces(const string &str, string &ret)
{
    ret.clear();
    ret.reserve(str.size());
    bool prevWasSpace = false;
    for(size_t i = 0; i < str.size(); ++i) {
        // isspace() is undefined for negative arguments other than EOF, so
        // the byte is converted through unsigned char first.
        const bool isWs = isspace(static_cast<unsigned char>(str[i])) != 0;
        if(!(isWs && prevWasSpace))
            ret += str[i];
        prevWasSpace = isWs;
    }
}