]>
Commit | Line | Data |
---|---|---|
b32b8144 | 1 | // |
92f5a8d4 | 2 | // Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com) |
b32b8144 FG |
3 | // |
4 | // Distributed under the Boost Software License, Version 1.0. (See accompanying | |
5 | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
6 | // | |
7 | // Official repository: https://github.com/boostorg/beast | |
8 | // | |
9 | ||
10 | //------------------------------------------------------------------------------ | |
11 | // | |
12 | // Example: HTTP crawl (asynchronous) | |
13 | // | |
14 | //------------------------------------------------------------------------------ | |
15 | ||
16 | #include "urls_large_data.hpp" | |
17 | ||
18 | #include <boost/beast/core.hpp> | |
19 | #include <boost/beast/http.hpp> | |
20 | #include <boost/beast/version.hpp> | |
21 | #include <boost/asio/bind_executor.hpp> | |
22 | #include <boost/asio/connect.hpp> | |
23 | #include <boost/asio/ip/tcp.hpp> | |
24 | #include <boost/asio/post.hpp> | |
25 | #include <boost/asio/strand.hpp> | |
26 | #include <atomic> | |
27 | #include <chrono> | |
28 | #include <cstdlib> | |
29 | #include <functional> | |
30 | #include <iomanip> | |
31 | #include <iostream> | |
32 | #include <memory> | |
33 | #include <string> | |
34 | #include <thread> | |
35 | #include <vector> | |
36 | #include <map> | |
37 | ||
b32b8144 | 38 | namespace chrono = std::chrono; // from <chrono> |
92f5a8d4 TL |
39 | namespace beast = boost::beast; // from <boost/beast.hpp> |
40 | namespace http = beast::http; // from <boost/beast/http.hpp> | |
41 | namespace net = boost::asio; // from <boost/asio.hpp> | |
42 | using tcp = net::ip::tcp; // from <boost/asio/ip/tcp.hpp> | |
b32b8144 FG |
43 | |
44 | //------------------------------------------------------------------------------ | |
45 | ||
46 | // This structure aggregates statistics on all the sites | |
47 | class crawl_report | |
48 | { | |
92f5a8d4 TL |
49 | net::io_context& ioc_; |
50 | net::strand< | |
51 | net::io_context::executor_type> strand_; | |
b32b8144 FG |
52 | std::atomic<std::size_t> index_; |
53 | std::vector<char const*> const& hosts_; | |
54 | std::size_t count_ = 0; | |
55 | ||
56 | public: | |
92f5a8d4 | 57 | crawl_report(net::io_context& ioc) |
b32b8144 FG |
58 | : ioc_(ioc) |
59 | , strand_(ioc_.get_executor()) | |
60 | , index_(0) | |
61 | , hosts_(urls_large_data()) | |
62 | { | |
63 | } | |
64 | ||
65 | // Run an aggregation function on the strand. | |
66 | // This allows synchronization without a mutex. | |
67 | template<class F> | |
68 | void | |
69 | aggregate(F const& f) | |
70 | { | |
92f5a8d4 | 71 | net::post( |
b32b8144 FG |
72 | strand_, |
73 | [&, f] | |
74 | { | |
75 | f(*this); | |
76 | if(count_ % 100 == 0) | |
77 | { | |
78 | std::cerr << | |
79 | "Progress: " << count_ << " of " << hosts_.size() << "\n"; | |
80 | //std::cerr << *this; | |
81 | } | |
82 | ++count_; | |
83 | }); | |
84 | } | |
85 | ||
86 | // Returns the next host to check | |
87 | char const* | |
88 | get_host() | |
89 | { | |
90 | auto const n = index_++; | |
91 | if(n >= hosts_.size()) | |
92 | return nullptr; | |
93 | return hosts_[n]; | |
94 | } | |
95 | ||
96 | // Counts the number of timer failures | |
97 | std::size_t timer_failures = 0; | |
98 | ||
99 | // Counts the number of name resolution failures | |
100 | std::size_t resolve_failures = 0; | |
101 | ||
102 | // Counts the number of connection failures | |
103 | std::size_t connect_failures = 0; | |
104 | ||
105 | // Counts the number of write failures | |
106 | std::size_t write_failures = 0; | |
107 | ||
108 | // Counts the number of read failures | |
109 | std::size_t read_failures = 0; | |
110 | ||
111 | // Counts the number of success reads | |
112 | std::size_t success = 0; | |
113 | ||
114 | // Counts the number received of each status code | |
115 | std::map<unsigned, std::size_t> status_codes; | |
116 | }; | |
117 | ||
118 | std::ostream& | |
119 | operator<<(std::ostream& os, crawl_report const& report) | |
120 | { | |
121 | // Print the report | |
122 | os << | |
123 | "Crawl report\n" << | |
124 | " Failure counts\n" << | |
125 | " Timer : " << report.timer_failures << "\n" << | |
126 | " Resolve : " << report.resolve_failures << "\n" << | |
127 | " Connect : " << report.connect_failures << "\n" << | |
128 | " Write : " << report.write_failures << "\n" << | |
129 | " Read : " << report.read_failures << "\n" << | |
130 | " Success : " << report.success << "\n" << | |
131 | " Status codes\n" | |
132 | ; | |
133 | for(auto const& result : report.status_codes) | |
134 | os << | |
135 | " " << std::setw(3) << result.first << ": " << result.second << | |
136 | " (" << http::obsolete_reason(static_cast<http::status>(result.first)) << ")\n"; | |
137 | os.flush(); | |
138 | return os; | |
139 | } | |
140 | ||
141 | //------------------------------------------------------------------------------ | |
142 | ||
143 | // Performs HTTP GET requests and aggregates the results into a report | |
144 | class worker : public std::enable_shared_from_this<worker> | |
145 | { | |
146 | enum | |
147 | { | |
148 | // Use a small timeout to keep things lively | |
149 | timeout = 5 | |
150 | }; | |
151 | ||
152 | crawl_report& report_; | |
153 | tcp::resolver resolver_; | |
92f5a8d4 TL |
154 | beast::tcp_stream stream_; |
155 | beast::flat_buffer buffer_; // (Must persist between reads) | |
b32b8144 FG |
156 | http::request<http::empty_body> req_; |
157 | http::response<http::string_body> res_; | |
158 | ||
159 | public: | |
160 | worker(worker&&) = default; | |
161 | ||
162 | // Resolver and socket require an io_context | |
163 | worker( | |
164 | crawl_report& report, | |
92f5a8d4 | 165 | net::io_context& ioc) |
b32b8144 | 166 | : report_(report) |
92f5a8d4 TL |
167 | , resolver_(net::make_strand(ioc)) |
168 | , stream_(net::make_strand(ioc)) | |
b32b8144 FG |
169 | { |
170 | // Set up the common fields of the request | |
171 | req_.version(11); | |
172 | req_.method(http::verb::get); | |
173 | req_.target("/"); | |
174 | req_.set(http::field::user_agent, BOOST_BEAST_VERSION_STRING); | |
175 | } | |
176 | ||
177 | // Start the asynchronous operation | |
178 | void | |
179 | run() | |
180 | { | |
b32b8144 FG |
181 | do_get_host(); |
182 | } | |
183 | ||
b32b8144 FG |
184 | void |
185 | do_get_host() | |
186 | { | |
187 | // Grab another host | |
188 | auto const host = report_.get_host(); | |
189 | ||
190 | // nullptr means no more work | |
191 | if(! host) | |
b32b8144 | 192 | return; |
b32b8144 FG |
193 | |
194 | // The Host HTTP field is required | |
195 | req_.set(http::field::host, host); | |
196 | ||
b32b8144 FG |
197 | // Set up an HTTP GET request message |
198 | // Look up the domain name | |
199 | resolver_.async_resolve( | |
200 | host, | |
201 | "http", | |
92f5a8d4 TL |
202 | beast::bind_front_handler( |
203 | &worker::on_resolve, | |
204 | shared_from_this())); | |
b32b8144 FG |
205 | } |
206 | ||
207 | void | |
208 | on_resolve( | |
92f5a8d4 | 209 | beast::error_code ec, |
b32b8144 FG |
210 | tcp::resolver::results_type results) |
211 | { | |
212 | if(ec) | |
213 | { | |
214 | report_.aggregate( | |
215 | [](crawl_report& rep) | |
216 | { | |
217 | ++rep.resolve_failures; | |
218 | }); | |
219 | return do_get_host(); | |
220 | } | |
221 | ||
92f5a8d4 TL |
222 | // Set a timeout on the operation |
223 | stream_.expires_after(std::chrono::seconds(10)); | |
b32b8144 FG |
224 | |
225 | // Make the connection on the IP address we get from a lookup | |
92f5a8d4 TL |
226 | stream_.async_connect( |
227 | results, | |
228 | beast::bind_front_handler( | |
229 | &worker::on_connect, | |
230 | shared_from_this())); | |
b32b8144 FG |
231 | } |
232 | ||
233 | void | |
92f5a8d4 | 234 | on_connect(beast::error_code ec, tcp::resolver::results_type::endpoint_type) |
b32b8144 FG |
235 | { |
236 | if(ec) | |
237 | { | |
238 | report_.aggregate( | |
239 | [](crawl_report& rep) | |
240 | { | |
241 | ++rep.connect_failures; | |
242 | }); | |
243 | return do_get_host(); | |
244 | } | |
245 | ||
92f5a8d4 TL |
246 | // Set a timeout on the operation |
247 | stream_.expires_after(std::chrono::seconds(10)); | |
b32b8144 FG |
248 | |
249 | // Send the HTTP request to the remote host | |
250 | http::async_write( | |
92f5a8d4 | 251 | stream_, |
b32b8144 | 252 | req_, |
92f5a8d4 TL |
253 | beast::bind_front_handler( |
254 | &worker::on_write, | |
255 | shared_from_this())); | |
b32b8144 FG |
256 | } |
257 | ||
258 | void | |
259 | on_write( | |
92f5a8d4 | 260 | beast::error_code ec, |
b32b8144 FG |
261 | std::size_t bytes_transferred) |
262 | { | |
263 | boost::ignore_unused(bytes_transferred); | |
264 | ||
265 | if(ec) | |
266 | { | |
267 | report_.aggregate( | |
268 | [](crawl_report& rep) | |
269 | { | |
270 | ++rep.write_failures; | |
271 | }); | |
272 | return do_get_host(); | |
273 | } | |
b32b8144 FG |
274 | |
275 | // Receive the HTTP response | |
92f5a8d4 | 276 | res_ = {}; |
b32b8144 | 277 | http::async_read( |
92f5a8d4 | 278 | stream_, |
b32b8144 FG |
279 | buffer_, |
280 | res_, | |
92f5a8d4 TL |
281 | beast::bind_front_handler( |
282 | &worker::on_read, | |
283 | shared_from_this())); | |
b32b8144 FG |
284 | } |
285 | ||
286 | void | |
287 | on_read( | |
92f5a8d4 | 288 | beast::error_code ec, |
b32b8144 FG |
289 | std::size_t bytes_transferred) |
290 | { | |
291 | boost::ignore_unused(bytes_transferred); | |
292 | ||
293 | if(ec) | |
294 | { | |
295 | report_.aggregate( | |
296 | [](crawl_report& rep) | |
297 | { | |
298 | ++rep.read_failures; | |
299 | }); | |
300 | return do_get_host(); | |
301 | } | |
302 | ||
303 | auto const code = res_.result_int(); | |
304 | report_.aggregate( | |
305 | [code](crawl_report& rep) | |
306 | { | |
307 | ++rep.success; | |
308 | ++rep.status_codes[code]; | |
309 | }); | |
310 | ||
311 | // Gracefully close the socket | |
92f5a8d4 TL |
312 | stream_.socket().shutdown(tcp::socket::shutdown_both, ec); |
313 | stream_.close(); | |
b32b8144 FG |
314 | |
315 | // If we get here then the connection is closed gracefully | |
316 | ||
317 | do_get_host(); | |
318 | } | |
319 | }; | |
320 | ||
// Stopwatch that starts when it is constructed.
class timer
{
    // A monotonic clock is used so that adjustments to the system time
    // while the crawl is running cannot distort (or make negative) the
    // reported elapsed time.
    using clock_type = std::chrono::steady_clock;

    // The moment this timer was constructed
    clock_type::time_point when_;

public:
    using duration = clock_type::duration;

    timer()
        : when_(clock_type::now())
    {
    }

    // Returns the time that has passed since construction.
    duration
    elapsed() const
    {
        return clock_type::now() - when_;
    }
};
341 | ||
342 | int main(int argc, char* argv[]) | |
343 | { | |
344 | // Check command line arguments. | |
345 | if (argc != 2) | |
346 | { | |
347 | std::cerr << | |
348 | "Usage: http-crawl <threads>\n" << | |
349 | "Example:\n" << | |
350 | " http-crawl 100 1\n"; | |
351 | return EXIT_FAILURE; | |
352 | } | |
353 | auto const threads = std::max<int>(1, std::atoi(argv[1])); | |
354 | ||
355 | // The io_context is required for all I/O | |
92f5a8d4 | 356 | net::io_context ioc; |
b32b8144 FG |
357 | |
358 | // The work keeps io_context::run from returning | |
92f5a8d4 | 359 | auto work = net::make_work_guard(ioc); |
b32b8144 FG |
360 | |
361 | // The report holds the aggregated statistics | |
362 | crawl_report report{ioc}; | |
363 | ||
364 | timer t; | |
365 | ||
366 | // Create and launch the worker threads. | |
367 | std::vector<std::thread> workers; | |
368 | workers.reserve(threads + 1); | |
369 | for(int i = 0; i < threads; ++i) | |
370 | workers.emplace_back( | |
371 | [&report] | |
372 | { | |
373 | // We use a separate io_context for each worker because | |
374 | // the asio resolver simulates asynchronous operation using | |
375 | // a dedicated worker thread per io_context, and we want to | |
376 | // do a lot of name resolutions in parallel. | |
92f5a8d4 | 377 | net::io_context ioc{1}; |
b32b8144 FG |
378 | std::make_shared<worker>(report, ioc)->run(); |
379 | ioc.run(); | |
380 | }); | |
381 | ||
382 | // Add another thread to run the main io_context which | |
383 | // is used to aggregate the statistics | |
384 | workers.emplace_back( | |
385 | [&ioc] | |
386 | { | |
387 | ioc.run(); | |
388 | }); | |
389 | ||
390 | // Now block until all threads exit | |
391 | for(std::size_t i = 0; i < workers.size(); ++i) | |
392 | { | |
393 | auto& thread = workers[i]; | |
394 | ||
395 | // If this is the last thread, reset the | |
396 | // work object so that it can return from run. | |
397 | if(i == workers.size() - 1) | |
398 | work.reset(); | |
399 | ||
400 | // Wait for the thread to exit | |
401 | thread.join(); | |
402 | } | |
403 | ||
404 | std::cout << | |
405 | "Elapsed time: " << chrono::duration_cast<chrono::seconds>(t.elapsed()).count() << " seconds\n"; | |
406 | std::cout << report; | |
407 | ||
408 | return EXIT_SUCCESS; | |
409 | } |