2 // Copyright (c) 2016-2017 Vinnie Falco (vinnie dot falco at gmail dot com)
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
7 // Official repository: https://github.com/boostorg/beast
10 //------------------------------------------------------------------------------
12 // Example: HTTP crawl (asynchronous)
14 //------------------------------------------------------------------------------
16 #include "urls_large_data.hpp"
18 #include <boost/beast/core.hpp>
19 #include <boost/beast/http.hpp>
20 #include <boost/beast/version.hpp>
21 #include <boost/asio/bind_executor.hpp>
22 #include <boost/asio/connect.hpp>
23 #include <boost/asio/ip/tcp.hpp>
24 #include <boost/asio/post.hpp>
25 #include <boost/asio/strand.hpp>
38 using tcp
= boost::asio::ip::tcp
; // from <boost/asio/ip/tcp.hpp>
39 namespace http
= boost::beast::http
; // from <boost/beast/http.hpp>
40 namespace chrono
= std::chrono
; // from <chrono>
42 //------------------------------------------------------------------------------
44 // This structure aggregates statistics on all the sites
47 boost::asio::io_context
& ioc_
;
49 boost::asio::io_context::executor_type
> strand_
;
50 std::atomic
<std::size_t> index_
;
51 std::vector
<char const*> const& hosts_
;
52 std::size_t count_
= 0;
55 crawl_report(boost::asio::io_context
& ioc
)
57 , strand_(ioc_
.get_executor())
59 , hosts_(urls_large_data())
63 // Run an aggregation function on the strand.
64 // This allows synchronization without a mutex.
77 "Progress: " << count_
<< " of " << hosts_
.size() << "\n";
84 // Returns the next host to check
88 auto const n
= index_
++;
89 if(n
>= hosts_
.size())
94 // Counts the number of timer failures
95 std::size_t timer_failures
= 0;
97 // Counts the number of name resolution failures
98 std::size_t resolve_failures
= 0;
100 // Counts the number of connection failures
101 std::size_t connect_failures
= 0;
103 // Counts the number of write failures
104 std::size_t write_failures
= 0;
106 // Counts the number of read failures
107 std::size_t read_failures
= 0;
109 // Counts the number of successful reads
110 std::size_t success
= 0;
112 // Counts the number of responses received for each status code
113 std::map
<unsigned, std::size_t> status_codes
;
117 operator<<(std::ostream
& os
, crawl_report
const& report
)
122 " Failure counts\n" <<
123 " Timer : " << report
.timer_failures
<< "\n" <<
124 " Resolve : " << report
.resolve_failures
<< "\n" <<
125 " Connect : " << report
.connect_failures
<< "\n" <<
126 " Write : " << report
.write_failures
<< "\n" <<
127 " Read : " << report
.read_failures
<< "\n" <<
128 " Success : " << report
.success
<< "\n" <<
131 for(auto const& result
: report
.status_codes
)
133 " " << std::setw(3) << result
.first
<< ": " << result
.second
<<
134 " (" << http::obsolete_reason(static_cast<http::status
>(result
.first
)) << ")\n";
139 //------------------------------------------------------------------------------
141 // Performs HTTP GET requests and aggregates the results into a report
142 class worker
: public std::enable_shared_from_this
<worker
>
146 // Use a small timeout to keep things lively
150 crawl_report
& report_
;
151 tcp::resolver resolver_
;
153 boost::asio::steady_timer timer_
;
155 boost::asio::io_context::executor_type
> strand_
;
156 boost::beast::flat_buffer buffer_
; // (Must persist between reads)
157 http::request
<http::empty_body
> req_
;
158 http::response
<http::string_body
> res_
;
161 worker(worker
&&) = default;
163 // Resolver and socket require an io_context
165 crawl_report
& report
,
166 boost::asio::io_context
& ioc
)
171 (chrono::steady_clock::time_point::max
)())
172 , strand_(ioc
.get_executor())
174 // Set up the common fields of the request
176 req_
.method(http::verb::get
);
178 req_
.set(http::field::user_agent
, BOOST_BEAST_VERSION_STRING
);
181 // Start the asynchronous operation
185 // Run the timer. The timer is operated
186 // continuously; this simplifies the code.
193 on_timer(boost::system::error_code ec
)
195 if(ec
&& ec
!= boost::asio::error::operation_aborted
)
197 // Should never happen
199 [](crawl_report
& rep
)
201 ++rep
.timer_failures
;
206 // Verify that the timer really expired since the deadline may have moved.
207 if(timer_
.expiry() <= chrono::steady_clock::now())
209 socket_
.shutdown(tcp::socket::shutdown_both
, ec
);
216 boost::asio::bind_executor(
221 std::placeholders::_1
)));
228 auto const host
= report_
.get_host();
230 // nullptr means no more work
237 // The Host HTTP field is required
238 req_
.set(http::field::host
, host
);
241 timer_
.expires_after(chrono::seconds(timeout
));
243 // Set up an HTTP GET request message
244 // Look up the domain name
245 resolver_
.async_resolve(
248 boost::asio::bind_executor(
253 std::placeholders::_1
,
254 std::placeholders::_2
)));
259 boost::system::error_code ec
,
260 tcp::resolver::results_type results
)
265 [](crawl_report
& rep
)
267 ++rep
.resolve_failures
;
269 return do_get_host();
273 timer_
.expires_after(chrono::seconds(timeout
));
275 // Make the connection on the IP address we get from a lookup
276 boost::asio::async_connect(
280 boost::asio::bind_executor(
285 std::placeholders::_1
)));
289 on_connect(boost::system::error_code ec
)
294 [](crawl_report
& rep
)
296 ++rep
.connect_failures
;
298 return do_get_host();
302 timer_
.expires_after(chrono::seconds(timeout
));
304 // Send the HTTP request to the remote host
308 boost::asio::bind_executor(
313 std::placeholders::_1
,
314 std::placeholders::_2
)));
319 boost::system::error_code ec
,
320 std::size_t bytes_transferred
)
322 boost::ignore_unused(bytes_transferred
);
327 [](crawl_report
& rep
)
329 ++rep
.write_failures
;
331 return do_get_host();
335 timer_
.expires_after(chrono::seconds(timeout
));
337 // Receive the HTTP response
342 boost::asio::bind_executor(
347 std::placeholders::_1
,
348 std::placeholders::_2
)));
353 boost::system::error_code ec
,
354 std::size_t bytes_transferred
)
356 boost::ignore_unused(bytes_transferred
);
361 [](crawl_report
& rep
)
365 return do_get_host();
368 auto const code
= res_
.result_int();
370 [code
](crawl_report
& rep
)
373 ++rep
.status_codes
[code
];
376 // Gracefully close the socket
377 socket_
.shutdown(tcp::socket::shutdown_both
, ec
);
380 // If we get here then the connection is closed gracefully
388 using clock_type
= chrono::system_clock
;
390 clock_type::time_point when_
;
393 using duration
= clock_type::duration
;
396 : when_(clock_type::now())
403 return clock_type::now() - when_
;
407 int main(int argc
, char* argv
[])
409 // Check command line arguments.
413 "Usage: http-crawl <threads>\n" <<
415 " http-crawl 100 1\n";
418 auto const threads
= std::max
<int>(1, std::atoi(argv
[1]));
420 // The io_context is required for all I/O
421 boost::asio::io_context ioc
{1};
423 // The work keeps io_context::run from returning
424 auto work
= boost::asio::make_work_guard(ioc
);
426 // The report holds the aggregated statistics
427 crawl_report report
{ioc
};
431 // Create and launch the worker threads.
432 std::vector
<std::thread
> workers
;
433 workers
.reserve(threads
+ 1);
434 for(int i
= 0; i
< threads
; ++i
)
435 workers
.emplace_back(
438 // We use a separate io_context for each worker because
439 // the asio resolver simulates asynchronous operation using
440 // a dedicated worker thread per io_context, and we want to
441 // do a lot of name resolutions in parallel.
442 boost::asio::io_context ioc
{1};
443 std::make_shared
<worker
>(report
, ioc
)->run();
447 // Add another thread to run the main io_context which
448 // is used to aggregate the statistics
449 workers
.emplace_back(
455 // Now block until all threads exit
456 for(std::size_t i
= 0; i
< workers
.size(); ++i
)
458 auto& thread
= workers
[i
];
460 // If this is the last thread, reset the
461 // work object so that it can return from run.
462 if(i
== workers
.size() - 1)
465 // Wait for the thread to exit
470 "Elapsed time: " << chrono::duration_cast
<chrono::seconds
>(t
.elapsed()).count() << " seconds\n";