1 /* Boost.Flyweight example of flyweight-based formatted text processing.
3 * Copyright 2006-2014 Joaquin M Lopez Munoz.
4 * Distributed under the Boost Software License, Version 1.0.
5 * (See accompanying file LICENSE_1_0.txt or copy at
6 * http://www.boost.org/LICENSE_1_0.txt)
8 * See http://www.boost.org/libs/flyweight for library home page.
11 #include <boost/flyweight.hpp>
12 #include <boost/functional/hash.hpp>
23 #if defined(BOOST_NO_STDC_NAMESPACE)
24 namespace std
{using ::exit
;using ::tolower
;}
27 using namespace boost::flyweights
;
/* An HTML tag consists of a name and optional properties of the form
 * name1=value1 ... namen=valuen. We do not need to parse the properties
 * for the purposes of the program, hence they are all stored in
 * html_tag_data::properties in raw form.
 */

struct html_tag_data
{
  std::string name;       /* tag name */
  std::string properties; /* unparsed property text, empty if none */
};

/* Two tags are equal when both their names and their raw property
 * strings compare equal.
 */

bool operator==(const html_tag_data& x,const html_tag_data& y)
{
  return x.name==y.name&&x.properties==y.properties;
}
46 /* See the portability section of Boost.Hash at
47 * http://boost.org/doc/html/hash/portability.html
48 * for an explanation of the ADL-related workarounds.
51 #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
55 std::size_t hash_value(const html_tag_data
& x
)
58 boost::hash_combine(res
,x
.name
);
59 boost::hash_combine(res
,x
.properties
);
63 #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
64 } /* namespace boost */
67 typedef flyweight
<html_tag_data
> html_tag
;
69 /* parse_tag is passed an iterator positioned at the first char of
70 * the tag after the opening '<' and returns, if succesful, a parsed tag
71 * and whether it is opening (<xx>) or closing (</xx>).
74 enum tag_type
{opening
,closing
,failure
};
78 parse_tag_res(tag_type type_
,const html_tag_data
& tag_
=html_tag_data()):
79 type(type_
),tag(tag_
){}
80 parse_tag_res(const parse_tag_res
& x
):type(x
.type
),tag(x
.tag
){}
86 template<typename ForwardIterator
>
87 parse_tag_res
parse_tag(ForwardIterator
& first
,ForwardIterator last
)
92 for(ForwardIterator it
=first
;it
!=last
;){
94 if(ch
=='>'&&!in_quote
){ /* ignore '>'s if inside quotes */
96 std::string::size_type
97 bname
=buf
.find_first_not_of("\t\n\r "),
98 ename
=bname
==std::string::npos
?
100 buf
.find_first_of("\t\n\r ",bname
),
101 bprop
=ename
==std::string::npos
?
103 buf
.find_first_not_of("\t\n\r ",ename
);
104 if(bname
==ename
){ /* null name */
105 return parse_tag_res(failure
);
107 else if(buf
[bname
]=='/'){ /* closing tag */
112 tag
.name
=buf
.substr(bname
,ename
-bname
);
113 std::transform( /* normalize tag name to lower case */
114 tag
.name
.begin(),tag
.name
.end(),tag
.name
.begin(),
115 (int(*)(int))std::tolower
);
116 if(bprop
!=std::string::npos
){
117 tag
.properties
=buf
.substr(bprop
,buf
.size());
119 first
=it
; /* result good, consume the chars */
120 return parse_tag_res(type
,tag
);
123 if(ch
=='"')in_quote
=!in_quote
;
127 return parse_tag_res(failure
); /* end reached and found no '>' */
130 /* A character context is just a vector containing the tags enclosing the
131 * character, from the outermost level to the innermost.
134 typedef std::vector
<html_tag
> html_context_data
;
135 typedef flyweight
<html_context_data
> html_context
;
137 /* A character is a char code plus its context.
140 struct character_data
142 character_data(char code_
=0,html_context context_
=html_context()):
143 code(code_
),context(context_
){}
144 character_data(const character_data
& x
):code(x
.code
),context(x
.context
){}
147 html_context context
;
150 bool operator==(const character_data
& x
,const character_data
& y
)
152 return x
.code
==y
.code
&&x
.context
==y
.context
;
155 #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
159 std::size_t hash_value(const character_data
& x
)
162 boost::hash_combine(res
,x
.code
);
163 boost::hash_combine(res
,x
.context
);
167 #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
168 } /* namespace boost */
171 typedef flyweight
<character_data
> character
;
173 /* scan_html converts HTML code into a stream of contextualized characters.
176 template<typename ForwardIterator
,typename OutputIterator
>
177 void scan_html(ForwardIterator first
,ForwardIterator last
,OutputIterator out
)
179 html_context_data context
;
181 if(*first
=='<'){ /* tag found */
183 parse_tag_res res
=parse_tag(first
,last
);
184 if(res
.type
==opening
){ /* add to contex */
185 context
.push_back(res
.tag
);
188 else if(res
.type
==closing
){ /* remove from context */
189 /* Pop all tags from the innermost to the matching one; this takes
190 * care of missing </xx>s like vg. in <ul><li>hello</ul>.
193 for(html_context_data::reverse_iterator rit
=context
.rbegin();
194 rit
!=context
.rend();++rit
){
195 if(rit
->get().name
==res
.tag
.get().name
){
196 context
.erase(rit
.base()-1,context
.end());
203 *out
++=character(*first
++,html_context(context
));
207 /* HTML-producing utilities */
209 void print_opening_tag(std::ostream
& os
,const html_tag_data
& x
)
212 if(!x
.properties
.empty())os
<<" "<<x
.properties
;
216 void print_closing_tag(std::ostream
& os
,const html_tag_data
& x
)
218 /* SGML declarations (beginning with '!') are not closed */
220 if(x
.name
[0]!='!')os
<<"</"<<x
.name
<<">";
223 /* change_context takes contexts from and to with tags
225 * from<- c1 ... cn fn+1 ... fm
226 * to <- c1 ... cn tn+1 ... tk
228 * (that is, they share the first n tags, n might be 0), and
229 * produces code closing fm ... fn+1 and opening tn+1 ... tk.
232 template<typename OutputIterator
>
234 const html_context_data
& from
,const html_context_data
& to
,
237 std::ostringstream oss
;
238 html_context_data::const_iterator
243 for(;it0
!=it0_end
&&it1
!=it1_end
&&*it0
==*it1
;++it0
,++it1
);
244 while(it0_end
!=it0
)print_closing_tag(oss
,*--it0_end
);
245 while(it1
!=it1_end
)print_opening_tag(oss
,*it1
++);
246 std::string str
=oss
.str();
247 std::copy(str
.begin(),str
.end(),out
);
250 /* produce_html is passed a bunch of contextualized characters and emits
251 * the corresponding HTML. The algorithm is simple: tags are opened and closed
252 * as a result of the context from one character to the following changing.
255 template<typename ForwardIterator
,typename OutputIterator
>
256 void produce_html(ForwardIterator first
,ForwardIterator last
,OutputIterator out
)
258 html_context context
;
260 if(first
->get().context
!=context
){
261 change_context(context
,first
->get().context
,out
);
262 context
=first
->get().context
;
264 *out
++=(first
++)->get().code
;
266 change_context(context
,html_context(),out
); /* close remaining context */
269 /* Without these explicit instantiations, MSVC++ 6.5/7.0 does not
270 * find some friend operators in certain contexts.
278 std::cout
<<"input html file: ";
280 std::getline(std::cin
,in
);
281 std::ifstream
ifs(in
.c_str());
283 std::cout
<<"can't open "<<in
<<std::endl
;
284 std::exit(EXIT_FAILURE
);
286 typedef std::istreambuf_iterator
<char> istrbuf_iterator
;
287 std::vector
<char> html_source
;
289 istrbuf_iterator(ifs
),istrbuf_iterator(),
290 std::back_inserter(html_source
));
294 std::vector
<character
> scanned_html
;
296 html_source
.begin(),html_source
.end(),std::back_inserter(scanned_html
));
298 /* Now that we have the text as a vector of contextualized characters,
299 * we can shuffle it around and manipulate in almost any way we please.
300 * For instance, the following reverses the central portion of the doc.
304 scanned_html
.begin()+scanned_html
.size()/4,
305 scanned_html
.begin()+3*(scanned_html
.size()/4));
307 /* emit the resulting HTML */
309 std::cout
<<"output html file: ";
311 std::getline(std::cin
,out
);
312 std::ofstream
ofs(out
.c_str());
314 std::cout
<<"can't open "<<out
<<std::endl
;
315 std::exit(EXIT_FAILURE
);
317 typedef std::ostreambuf_iterator
<char> ostrbuf_iterator
;
318 produce_html(scanned_html
.begin(),scanned_html
.end(),ostrbuf_iterator(ofs
));