]> git.proxmox.com Git - ceph.git/blame - ceph/src/boost/libs/beast/example/http/client/crawl/http_crawl.cpp
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / boost / libs / beast / example / http / client / crawl / http_crawl.cpp
CommitLineData
b32b8144 1//
92f5a8d4 2// Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com)
b32b8144
FG
3//
4// Distributed under the Boost Software License, Version 1.0. (See accompanying
5// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6//
7// Official repository: https://github.com/boostorg/beast
8//
9
10//------------------------------------------------------------------------------
11//
12// Example: HTTP crawl (asynchronous)
13//
14//------------------------------------------------------------------------------
15
16#include "urls_large_data.hpp"
17
18#include <boost/beast/core.hpp>
19#include <boost/beast/http.hpp>
20#include <boost/beast/version.hpp>
21#include <boost/asio/bind_executor.hpp>
22#include <boost/asio/connect.hpp>
23#include <boost/asio/ip/tcp.hpp>
24#include <boost/asio/post.hpp>
25#include <boost/asio/strand.hpp>
26#include <atomic>
27#include <chrono>
28#include <cstdlib>
29#include <functional>
30#include <iomanip>
31#include <iostream>
32#include <memory>
33#include <string>
34#include <thread>
35#include <vector>
36#include <map>
37
b32b8144 38namespace chrono = std::chrono; // from <chrono>
92f5a8d4
TL
39namespace beast = boost::beast; // from <boost/beast.hpp>
40namespace http = beast::http; // from <boost/beast/http.hpp>
41namespace net = boost::asio; // from <boost/asio.hpp>
42using tcp = net::ip::tcp; // from <boost/asio/ip/tcp.hpp>
b32b8144
FG
43
44//------------------------------------------------------------------------------
45
46// This structure aggregates statistics on all the sites
47class crawl_report
48{
92f5a8d4
TL
49 net::strand<
50 net::io_context::executor_type> strand_;
b32b8144
FG
51 std::atomic<std::size_t> index_;
52 std::vector<char const*> const& hosts_;
53 std::size_t count_ = 0;
54
55public:
92f5a8d4 56 crawl_report(net::io_context& ioc)
20effc67 57 : strand_(ioc.get_executor())
b32b8144
FG
58 , index_(0)
59 , hosts_(urls_large_data())
60 {
61 }
62
63 // Run an aggregation function on the strand.
64 // This allows synchronization without a mutex.
65 template<class F>
66 void
67 aggregate(F const& f)
68 {
92f5a8d4 69 net::post(
b32b8144
FG
70 strand_,
71 [&, f]
72 {
73 f(*this);
74 if(count_ % 100 == 0)
75 {
76 std::cerr <<
77 "Progress: " << count_ << " of " << hosts_.size() << "\n";
78 //std::cerr << *this;
79 }
80 ++count_;
81 });
82 }
83
84 // Returns the next host to check
85 char const*
86 get_host()
87 {
88 auto const n = index_++;
89 if(n >= hosts_.size())
90 return nullptr;
91 return hosts_[n];
92 }
93
94 // Counts the number of timer failures
95 std::size_t timer_failures = 0;
96
97 // Counts the number of name resolution failures
98 std::size_t resolve_failures = 0;
99
100 // Counts the number of connection failures
101 std::size_t connect_failures = 0;
102
103 // Counts the number of write failures
104 std::size_t write_failures = 0;
105
106 // Counts the number of read failures
107 std::size_t read_failures = 0;
108
109 // Counts the number of success reads
110 std::size_t success = 0;
111
112 // Counts the number received of each status code
113 std::map<unsigned, std::size_t> status_codes;
114};
115
116std::ostream&
117operator<<(std::ostream& os, crawl_report const& report)
118{
119 // Print the report
120 os <<
121 "Crawl report\n" <<
122 " Failure counts\n" <<
123 " Timer : " << report.timer_failures << "\n" <<
124 " Resolve : " << report.resolve_failures << "\n" <<
125 " Connect : " << report.connect_failures << "\n" <<
126 " Write : " << report.write_failures << "\n" <<
127 " Read : " << report.read_failures << "\n" <<
128 " Success : " << report.success << "\n" <<
129 " Status codes\n"
130 ;
131 for(auto const& result : report.status_codes)
132 os <<
133 " " << std::setw(3) << result.first << ": " << result.second <<
134 " (" << http::obsolete_reason(static_cast<http::status>(result.first)) << ")\n";
135 os.flush();
136 return os;
137}
138
139//------------------------------------------------------------------------------
140
141// Performs HTTP GET requests and aggregates the results into a report
142class worker : public std::enable_shared_from_this<worker>
143{
144 enum
145 {
146 // Use a small timeout to keep things lively
147 timeout = 5
148 };
149
150 crawl_report& report_;
20effc67 151 net::strand<net::io_context::executor_type> ex_;
b32b8144 152 tcp::resolver resolver_;
92f5a8d4
TL
153 beast::tcp_stream stream_;
154 beast::flat_buffer buffer_; // (Must persist between reads)
b32b8144
FG
155 http::request<http::empty_body> req_;
156 http::response<http::string_body> res_;
157
158public:
159 worker(worker&&) = default;
160
161 // Resolver and socket require an io_context
162 worker(
163 crawl_report& report,
92f5a8d4 164 net::io_context& ioc)
b32b8144 165 : report_(report)
20effc67
TL
166 , ex_(net::make_strand(ioc.get_executor()))
167 , resolver_(ex_)
168 , stream_(ex_)
b32b8144
FG
169 {
170 // Set up the common fields of the request
171 req_.version(11);
172 req_.method(http::verb::get);
173 req_.target("/");
174 req_.set(http::field::user_agent, BOOST_BEAST_VERSION_STRING);
175 }
176
177 // Start the asynchronous operation
178 void
179 run()
180 {
b32b8144
FG
181 do_get_host();
182 }
183
b32b8144
FG
184 void
185 do_get_host()
186 {
187 // Grab another host
188 auto const host = report_.get_host();
189
190 // nullptr means no more work
191 if(! host)
b32b8144 192 return;
b32b8144
FG
193
194 // The Host HTTP field is required
195 req_.set(http::field::host, host);
196
b32b8144
FG
197 // Set up an HTTP GET request message
198 // Look up the domain name
199 resolver_.async_resolve(
200 host,
201 "http",
92f5a8d4
TL
202 beast::bind_front_handler(
203 &worker::on_resolve,
204 shared_from_this()));
b32b8144
FG
205 }
206
207 void
208 on_resolve(
92f5a8d4 209 beast::error_code ec,
b32b8144
FG
210 tcp::resolver::results_type results)
211 {
212 if(ec)
213 {
214 report_.aggregate(
215 [](crawl_report& rep)
216 {
217 ++rep.resolve_failures;
218 });
219 return do_get_host();
220 }
221
92f5a8d4
TL
222 // Set a timeout on the operation
223 stream_.expires_after(std::chrono::seconds(10));
b32b8144
FG
224
225 // Make the connection on the IP address we get from a lookup
92f5a8d4
TL
226 stream_.async_connect(
227 results,
228 beast::bind_front_handler(
229 &worker::on_connect,
230 shared_from_this()));
b32b8144
FG
231 }
232
233 void
92f5a8d4 234 on_connect(beast::error_code ec, tcp::resolver::results_type::endpoint_type)
b32b8144
FG
235 {
236 if(ec)
237 {
238 report_.aggregate(
239 [](crawl_report& rep)
240 {
241 ++rep.connect_failures;
242 });
243 return do_get_host();
244 }
245
92f5a8d4
TL
246 // Set a timeout on the operation
247 stream_.expires_after(std::chrono::seconds(10));
b32b8144
FG
248
249 // Send the HTTP request to the remote host
250 http::async_write(
92f5a8d4 251 stream_,
b32b8144 252 req_,
92f5a8d4
TL
253 beast::bind_front_handler(
254 &worker::on_write,
255 shared_from_this()));
b32b8144
FG
256 }
257
258 void
259 on_write(
92f5a8d4 260 beast::error_code ec,
b32b8144
FG
261 std::size_t bytes_transferred)
262 {
263 boost::ignore_unused(bytes_transferred);
264
265 if(ec)
266 {
267 report_.aggregate(
268 [](crawl_report& rep)
269 {
270 ++rep.write_failures;
271 });
272 return do_get_host();
273 }
b32b8144
FG
274
275 // Receive the HTTP response
92f5a8d4 276 res_ = {};
b32b8144 277 http::async_read(
92f5a8d4 278 stream_,
b32b8144
FG
279 buffer_,
280 res_,
92f5a8d4
TL
281 beast::bind_front_handler(
282 &worker::on_read,
283 shared_from_this()));
b32b8144
FG
284 }
285
286 void
287 on_read(
92f5a8d4 288 beast::error_code ec,
b32b8144
FG
289 std::size_t bytes_transferred)
290 {
291 boost::ignore_unused(bytes_transferred);
292
293 if(ec)
294 {
295 report_.aggregate(
296 [](crawl_report& rep)
297 {
298 ++rep.read_failures;
299 });
300 return do_get_host();
301 }
302
303 auto const code = res_.result_int();
304 report_.aggregate(
305 [code](crawl_report& rep)
306 {
307 ++rep.success;
308 ++rep.status_codes[code];
309 });
310
311 // Gracefully close the socket
92f5a8d4
TL
312 stream_.socket().shutdown(tcp::socket::shutdown_both, ec);
313 stream_.close();
b32b8144
FG
314
315 // If we get here then the connection is closed gracefully
316
317 do_get_host();
318 }
319};
320
321class timer
322{
323 using clock_type = chrono::system_clock;
324
325 clock_type::time_point when_;
326
327public:
328 using duration = clock_type::duration;
329
330 timer()
331 : when_(clock_type::now())
332 {
333 }
334
335 duration
336 elapsed() const
337 {
338 return clock_type::now() - when_;
339 }
340};
341
342int main(int argc, char* argv[])
343{
344 // Check command line arguments.
345 if (argc != 2)
346 {
347 std::cerr <<
348 "Usage: http-crawl <threads>\n" <<
349 "Example:\n" <<
20effc67 350 " http-crawl 100\n";
b32b8144
FG
351 return EXIT_FAILURE;
352 }
353 auto const threads = std::max<int>(1, std::atoi(argv[1]));
354
20effc67 355 // The io_context is used to aggregate the statistics
92f5a8d4 356 net::io_context ioc;
b32b8144 357
b32b8144
FG
358 // The report holds the aggregated statistics
359 crawl_report report{ioc};
360
361 timer t;
362
363 // Create and launch the worker threads.
364 std::vector<std::thread> workers;
365 workers.reserve(threads + 1);
366 for(int i = 0; i < threads; ++i)
20effc67
TL
367 {
368 // Each worker will eventually add some data to the aggregated
369 // report. Outstanding work is tracked in each worker to
370 // represent the forthcoming delivery of this data by that
371 // worker.
372 auto reporting_work = net::require(
373 ioc.get_executor(),
374 net::execution::outstanding_work.tracked);
375
b32b8144 376 workers.emplace_back(
20effc67
TL
377 [&report, reporting_work] {
378 // We use a separate io_context for each worker because
379 // the asio resolver simulates asynchronous operation using
380 // a dedicated worker thread per io_context, and we want to
381 // do a lot of name resolutions in parallel.
382 net::io_context ioc;
383 std::make_shared<worker>(report, ioc)->run();
384 ioc.run();
385 });
386 }
b32b8144
FG
387
388 // Add another thread to run the main io_context which
389 // is used to aggregate the statistics
390 workers.emplace_back(
391 [&ioc]
392 {
393 ioc.run();
394 });
395
396 // Now block until all threads exit
397 for(std::size_t i = 0; i < workers.size(); ++i)
20effc67 398 workers[i].join();
b32b8144
FG
399
400 std::cout <<
401 "Elapsed time: " << chrono::duration_cast<chrono::seconds>(t.elapsed()).count() << " seconds\n";
402 std::cout << report;
403
404 return EXIT_SUCCESS;
405}