ceph/src/boost/libs/flyweight/example/html.cpp

   1 /* Boost.Flyweight example of flyweight-based formatted text processing.
   2  *
   3  * Copyright 2006-2014 Joaquin M Lopez Munoz.
   4  * Distributed under the Boost Software License, Version 1.0.
   5  * (See accompanying file LICENSE_1_0.txt or copy at
   6  * http://www.boost.org/LICENSE_1_0.txt)
   7  *
   8  * See http://www.boost.org/libs/flyweight for library home page.
   9  */
  10
#include <boost/flyweight.hpp>
#include <boost/functional/hash.hpp>
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
  22
  23 #if defined(BOOST_NO_STDC_NAMESPACE)
  24 namespace std{using ::exit;using ::tolower;}
  25 #endif
  26
  27 using namespace boost::flyweights;
  28
  29 /* An HTML tag consists of a name and optional properties of the form
  30  * name1=value1 ... namen=valuen. We do not need to parse the properties
  31  * for the purposes of the program, hence they are all stored in
  32  * html_tag_data::properties in raw form.
  33  */
  34
  35 struct html_tag_data
  36 {
  37   std::string name;
  38   std::string properties;
  39 };
  40
  41 bool operator==(const html_tag_data& x,const html_tag_data& y)
  42 {
  43   return x.name==y.name&&x.properties==y.properties;
  44 }
  45
  46 /* See the portability section of Boost.Hash at
  47  *   http://boost.org/doc/html/hash/portability.html
  48  * for an explanation of the ADL-related workarounds.
  49  */
  50
  51 #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
  52 namespace boost{
  53 #endif
  54
  55 std::size_t hash_value(const html_tag_data& x)
  56 {
  57   std::size_t res=0;
  58   boost::hash_combine(res,x.name);
  59   boost::hash_combine(res,x.properties);
  60   return res;
  61 }
  62
  63 #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
  64 } /* namespace boost */
  65 #endif
  66
/* html_tag is the flyweight handle over html_tag_data; it is the form in
 * which tags are stored in parse results and in character contexts.
 */
typedef flyweight<html_tag_data> html_tag;
  68
/* parse_tag is passed an iterator positioned at the first char of
 * the tag after the opening '<' and returns, if successful, a parsed tag
 * and whether it is opening (<xx>) or closing (</xx>).
 */
  73
  74 enum tag_type{opening,closing,failure};
  75
  76 struct parse_tag_res
  77 {
  78   parse_tag_res(tag_type type_,const html_tag_data& tag_=html_tag_data()):
  79     type(type_),tag(tag_){}
  80   parse_tag_res(const parse_tag_res& x):type(x.type),tag(x.tag){}
  81
  82   tag_type type;
  83   html_tag tag;
  84 };
  85
  86 template<typename ForwardIterator>
  87 parse_tag_res parse_tag(ForwardIterator& first,ForwardIterator last)
  88 {
  89   html_tag_data  tag;
  90   std::string    buf;
  91   bool           in_quote=false;
  92   for(ForwardIterator it=first;it!=last;){
  93     char ch=*it++;
  94     if(ch=='>'&&!in_quote){             /* ignore '>'s if inside quotes */
  95       tag_type type;
  96       std::string::size_type
  97         bname=buf.find_first_not_of("\t\n\r "),
  98         ename=bname==std::string::npos?
  99           std::string::npos:
 100           buf.find_first_of("\t\n\r ",bname),
 101         bprop=ename==std::string::npos?
 102           std::string::npos:
 103           buf.find_first_not_of("\t\n\r ",ename);
 104       if(bname==ename){                 /* null name */
 105         return parse_tag_res(failure);
 106       }
 107       else if(buf[bname]=='/'){         /* closing tag */
 108         type=closing;
 109         ++bname;
 110       }
 111       else type=opening;
 112       tag.name=buf.substr(bname,ename-bname);
 113       std::transform(                   /* normalize tag name to lower case */
 114         tag.name.begin(),tag.name.end(),tag.name.begin(),
 115         (int(*)(int))std::tolower);
 116       if(bprop!=std::string::npos){
 117         tag.properties=buf.substr(bprop,buf.size());
 118       }
 119       first=it;                         /* result good, consume the chars */
 120       return parse_tag_res(type,tag);
 121     }
 122     else{
 123       if(ch=='"')in_quote=!in_quote;
 124       buf+=ch;
 125     }
 126   }
 127   return parse_tag_res(failure);        /* end reached and found no '>' */
 128 }
 129
 130 /* A character context is just a vector containing the tags enclosing the
 131  * character, from the outermost level to the innermost.
 132  */
 133
/* html_context_data is the plain vector of enclosing tags; html_context is
 * the flyweight handle over such a vector, which is what each character
 * stores.
 */
typedef std::vector<html_tag>        html_context_data;
typedef flyweight<html_context_data> html_context;
 136
 137 /* A character is a char code plus its context.
 138  */
 139
 140 struct character_data
 141 {
 142   character_data(char code_=0,html_context context_=html_context()):
 143     code(code_),context(context_){}
 144   character_data(const character_data& x):code(x.code),context(x.context){}
 145
 146   char         code;
 147   html_context context;
 148 };
 149
 150 bool operator==(const character_data& x,const character_data& y)
 151 {
 152   return x.code==y.code&&x.context==y.context;
 153 }
 154
 155 #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
 156 namespace boost{
 157 #endif
 158
 159 std::size_t hash_value(const character_data& x)
 160 {
 161   std::size_t res=0;
 162   boost::hash_combine(res,x.code);
 163   boost::hash_combine(res,x.context);
 164   return res;
 165 }
 166
 167 #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
 168 } /* namespace boost */
 169 #endif
 170
/* character is the flyweight handle over character_data; scan_html emits
 * these and produce_html consumes them.
 */
typedef flyweight<character_data> character;
 172
 173 /* scan_html converts HTML code into a stream of contextualized characters.
 174  */
 175
 176 template<typename ForwardIterator,typename OutputIterator>
 177 void scan_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
 178 {
 179   html_context_data context;
 180   while(first!=last){
 181     if(*first=='<'){                                 /* tag found */
 182       ++first;
 183       parse_tag_res res=parse_tag(first,last);
 184       if(res.type==opening){                         /* add to contex */
 185         context.push_back(res.tag);
 186         continue;
 187       }
 188       else if(res.type==closing){                    /* remove from context */
 189         /* Pop all tags from the innermost to the matching one; this takes
 190          * care of missing </xx>s like vg. in <ul><li>hello</ul>.
 191          */
 192
 193         for(html_context_data::reverse_iterator rit=context.rbegin();
 194             rit!=context.rend();++rit){
 195           if(rit->get().name==res.tag.get().name){
 196             context.erase(rit.base()-1,context.end());
 197             break;
 198           }
 199         }
 200         continue;
 201       }
 202     }
 203     *out++=character(*first++,html_context(context));
 204   }
 205 }
 206
 207 /* HTML-producing utilities */
 208
 209 void print_opening_tag(std::ostream& os,const html_tag_data& x)
 210 {
 211   os<<"<"<<x.name;
 212   if(!x.properties.empty())os<<" "<<x.properties;
 213   os<<">";
 214 }
 215
 216 void print_closing_tag(std::ostream& os,const html_tag_data& x)
 217 {
 218   /* SGML declarations (beginning with '!') are not closed */
 219
 220   if(x.name[0]!='!')os<<"</"<<x.name<<">";
 221 }
 222
 223 /* change_context takes contexts from and to with tags
 224  *
 225  *   from<- c1 ... cn fn+1 ... fm
 226  *   to  <- c1 ... cn tn+1 ... tk
 227  *
 228  * (that is, they share the first n tags, n might be 0), and
 229  * produces code closing fm ... fn+1 and opening tn+1 ... tk.
 230  */
 231
 232 template<typename OutputIterator>
 233 void change_context(
 234   const html_context_data& from,const html_context_data& to,
 235   OutputIterator out)
 236 {
 237   std::ostringstream oss;
 238   html_context_data::const_iterator
 239     it0=from.begin(),
 240     it0_end=from.end(),
 241     it1=to.begin(),
 242     it1_end=to.end();
 243   for(;it0!=it0_end&&it1!=it1_end&&*it0==*it1;++it0,++it1);
 244   while(it0_end!=it0)print_closing_tag(oss,*--it0_end);
 245   while(it1!=it1_end)print_opening_tag(oss,*it1++);
 246   std::string str=oss.str();
 247   std::copy(str.begin(),str.end(),out);
 248 }
 249
 250 /* produce_html is passed a bunch of contextualized characters and emits
 251  * the corresponding HTML. The algorithm is simple: tags are opened and closed
 252  * as a result of the context from one character to the following changing.
 253  */
 254
 255 template<typename ForwardIterator,typename OutputIterator>
 256 void produce_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
 257 {
 258   html_context context;
 259   while(first!=last){
 260     if(first->get().context!=context){
 261       change_context(context,first->get().context,out);
 262       context=first->get().context;
 263     }
 264     *out++=(first++)->get().code;
 265   }
 266   change_context(context,html_context(),out); /* close remaining context */
 267 }
 268
 269 /* Without these explicit instantiations, MSVC++ 6.5/7.0 does not
 270  * find some friend operators in certain contexts.
 271  */
 272
/* Neither object is referenced elsewhere in this file; each one only forces
 * the corresponding flyweight type to be instantiated.
 */
character dummy1;
html_tag  dummy2;
 275
 276 int main()
 277 {
 278   std::cout<<"input html file: ";
 279   std::string in;
 280   std::getline(std::cin,in);
 281   std::ifstream ifs(in.c_str());
 282   if(!ifs){
 283     std::cout<<"can't open "<<in<<std::endl;
 284     std::exit(EXIT_FAILURE);
 285   }
 286   typedef std::istreambuf_iterator<char> istrbuf_iterator;
 287   std::vector<char> html_source;
 288   std::copy(
 289     istrbuf_iterator(ifs),istrbuf_iterator(),
 290     std::back_inserter(html_source));
 291
 292   /* parse the HTML */
 293
 294   std::vector<character> scanned_html;
 295   scan_html(
 296     html_source.begin(),html_source.end(),std::back_inserter(scanned_html));
 297
 298   /* Now that we have the text as a vector of contextualized characters,
 299    * we can shuffle it around and manipulate in almost any way we please.
 300    * For instance, the following reverses the central portion of the doc.
 301    */
 302
 303   std::reverse(
 304     scanned_html.begin()+scanned_html.size()/4,
 305     scanned_html.begin()+3*(scanned_html.size()/4));
 306
 307   /* emit the resulting HTML */
 308
 309   std::cout<<"output html file: ";
 310   std::string out;
 311   std::getline(std::cin,out);
 312   std::ofstream ofs(out.c_str());
 313   if(!ofs){
 314     std::cout<<"can't open "<<out<<std::endl;
 315     std::exit(EXIT_FAILURE);
 316   }
 317   typedef std::ostreambuf_iterator<char> ostrbuf_iterator;
 318   produce_html(scanned_html.begin(),scanned_html.end(),ostrbuf_iterator(ofs));
 319
 320   return 0;
 321 }