/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Copyright (C) Jeroen Ooms <[email protected]>
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
 * are also available at https://curl.se/docs/copyright.html.
 *
 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
 * copies of the Software, and permit persons to whom the Software is
 * furnished to do so, under the terms of the COPYING file.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 * SPDX-License-Identifier: curl
 *
 * To compile:
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
 *
 */
/* <DESC>
 * Web crawler based on curl and libxml2 to stress-test curl with
 * hundreds of concurrent connections to various servers.
 * </DESC>
 */

/* Parameters */

/* upper bound on simultaneously open connections for the multi handle */
int max_con = 200;
/* stop following new links once completed + pending transfers reach this */
int max_total = 20000;
/* stop following new links while this many transfers are still pending */
int max_requests = 500;
/* how many links to queue from a single HTML page */
int max_link_per_page = 5;
/* non-zero: resolve each href against the page URL before filtering it */
int follow_relative_links = 0;
/* URL of the first page to fetch */
char *start_page = "https://www.reuters.com";

#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <limits.h>
#include <math.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Set to 1 by sighandler() and polled by the main loop.
 *
 * volatile sig_atomic_t is the object type the C standard allows a signal
 * handler to write; a plain int may be cached in a register by the loop
 * that polls it.
 */
volatile sig_atomic_t pending_interrupt = 0;

/* Signal handler: only records that an interrupt was requested. */
void sighandler(int dummy)
{
  (void)dummy; /* the signal number is not needed */
  pending_interrupt = 1;
}

/* resizable buffer holding raw bytes (no NUL terminator is maintained) */
typedef struct {
  char *buf;   /* heap storage, grown with realloc() */
  size_t size; /* number of bytes currently stored in buf */
} memory;

grow_buffer(void * contents,size_t sz,size_t nmemb,void * ctx)62*6236dae4SAndroid Build Coastguard Worker size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
63*6236dae4SAndroid Build Coastguard Worker {
64*6236dae4SAndroid Build Coastguard Worker   size_t realsize = sz * nmemb;
65*6236dae4SAndroid Build Coastguard Worker   memory *mem = (memory*) ctx;
66*6236dae4SAndroid Build Coastguard Worker   char *ptr = realloc(mem->buf, mem->size + realsize);
67*6236dae4SAndroid Build Coastguard Worker   if(!ptr) {
68*6236dae4SAndroid Build Coastguard Worker     /* out of memory */
69*6236dae4SAndroid Build Coastguard Worker     printf("not enough memory (realloc returned NULL)\n");
70*6236dae4SAndroid Build Coastguard Worker     return 0;
71*6236dae4SAndroid Build Coastguard Worker   }
72*6236dae4SAndroid Build Coastguard Worker   mem->buf = ptr;
73*6236dae4SAndroid Build Coastguard Worker   memcpy(&(mem->buf[mem->size]), contents, realsize);
74*6236dae4SAndroid Build Coastguard Worker   mem->size += realsize;
75*6236dae4SAndroid Build Coastguard Worker   return realsize;
76*6236dae4SAndroid Build Coastguard Worker }
77*6236dae4SAndroid Build Coastguard Worker 
make_handle(char * url)78*6236dae4SAndroid Build Coastguard Worker CURL *make_handle(char *url)
79*6236dae4SAndroid Build Coastguard Worker {
80*6236dae4SAndroid Build Coastguard Worker   CURL *handle = curl_easy_init();
81*6236dae4SAndroid Build Coastguard Worker 
82*6236dae4SAndroid Build Coastguard Worker   /* Important: use HTTP2 over HTTPS */
83*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
84*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_URL, url);
85*6236dae4SAndroid Build Coastguard Worker 
86*6236dae4SAndroid Build Coastguard Worker   /* buffer body */
87*6236dae4SAndroid Build Coastguard Worker   memory *mem = malloc(sizeof(memory));
88*6236dae4SAndroid Build Coastguard Worker   mem->size = 0;
89*6236dae4SAndroid Build Coastguard Worker   mem->buf = malloc(1);
90*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
91*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
92*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
93*6236dae4SAndroid Build Coastguard Worker 
94*6236dae4SAndroid Build Coastguard Worker   /* For completeness */
95*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
96*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
97*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
98*6236dae4SAndroid Build Coastguard Worker   /* only allow redirects to HTTP and HTTPS URLs */
99*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_REDIR_PROTOCOLS_STR, "http,https");
100*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_AUTOREFERER, 1L);
101*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
102*6236dae4SAndroid Build Coastguard Worker   /* each transfer needs to be done within 20 seconds! */
103*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_TIMEOUT_MS, 20000L);
104*6236dae4SAndroid Build Coastguard Worker   /* connect fast or fail */
105*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT_MS, 2000L);
106*6236dae4SAndroid Build Coastguard Worker   /* skip files larger than a gigabyte */
107*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_MAXFILESIZE_LARGE,
108*6236dae4SAndroid Build Coastguard Worker                    (curl_off_t)1024*1024*1024);
109*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
110*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
111*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
112*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
113*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
114*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
115*6236dae4SAndroid Build Coastguard Worker   curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
116*6236dae4SAndroid Build Coastguard Worker   return handle;
117*6236dae4SAndroid Build Coastguard Worker }
118*6236dae4SAndroid Build Coastguard Worker 
119*6236dae4SAndroid Build Coastguard Worker /* HREF finder implemented in libxml2 but could be any HTML parser */
follow_links(CURLM * multi_handle,memory * mem,char * url)120*6236dae4SAndroid Build Coastguard Worker size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
121*6236dae4SAndroid Build Coastguard Worker {
122*6236dae4SAndroid Build Coastguard Worker   int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
123*6236dae4SAndroid Build Coastguard Worker              HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
124*6236dae4SAndroid Build Coastguard Worker   htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
125*6236dae4SAndroid Build Coastguard Worker   if(!doc)
126*6236dae4SAndroid Build Coastguard Worker     return 0;
127*6236dae4SAndroid Build Coastguard Worker   xmlChar *xpath = (xmlChar*) "//a/@href";
128*6236dae4SAndroid Build Coastguard Worker   xmlXPathContextPtr context = xmlXPathNewContext(doc);
129*6236dae4SAndroid Build Coastguard Worker   xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
130*6236dae4SAndroid Build Coastguard Worker   xmlXPathFreeContext(context);
131*6236dae4SAndroid Build Coastguard Worker   if(!result)
132*6236dae4SAndroid Build Coastguard Worker     return 0;
133*6236dae4SAndroid Build Coastguard Worker   xmlNodeSetPtr nodeset = result->nodesetval;
134*6236dae4SAndroid Build Coastguard Worker   if(xmlXPathNodeSetIsEmpty(nodeset)) {
135*6236dae4SAndroid Build Coastguard Worker     xmlXPathFreeObject(result);
136*6236dae4SAndroid Build Coastguard Worker     return 0;
137*6236dae4SAndroid Build Coastguard Worker   }
138*6236dae4SAndroid Build Coastguard Worker   size_t count = 0;
139*6236dae4SAndroid Build Coastguard Worker   int i;
140*6236dae4SAndroid Build Coastguard Worker   for(i = 0; i < nodeset->nodeNr; i++) {
141*6236dae4SAndroid Build Coastguard Worker     double r = rand();
142*6236dae4SAndroid Build Coastguard Worker     int x = r * nodeset->nodeNr / RAND_MAX;
143*6236dae4SAndroid Build Coastguard Worker     const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
144*6236dae4SAndroid Build Coastguard Worker     xmlChar *href = xmlNodeListGetString(doc, node, 1);
145*6236dae4SAndroid Build Coastguard Worker     if(follow_relative_links) {
146*6236dae4SAndroid Build Coastguard Worker       xmlChar *orig = href;
147*6236dae4SAndroid Build Coastguard Worker       href = xmlBuildURI(href, (xmlChar *) url);
148*6236dae4SAndroid Build Coastguard Worker       xmlFree(orig);
149*6236dae4SAndroid Build Coastguard Worker     }
150*6236dae4SAndroid Build Coastguard Worker     char *link = (char *) href;
151*6236dae4SAndroid Build Coastguard Worker     if(!link || strlen(link) < 20)
152*6236dae4SAndroid Build Coastguard Worker       continue;
153*6236dae4SAndroid Build Coastguard Worker     if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
154*6236dae4SAndroid Build Coastguard Worker       curl_multi_add_handle(multi_handle, make_handle(link));
155*6236dae4SAndroid Build Coastguard Worker       if(count++ == max_link_per_page)
156*6236dae4SAndroid Build Coastguard Worker         break;
157*6236dae4SAndroid Build Coastguard Worker     }
158*6236dae4SAndroid Build Coastguard Worker     xmlFree(link);
159*6236dae4SAndroid Build Coastguard Worker   }
160*6236dae4SAndroid Build Coastguard Worker   xmlXPathFreeObject(result);
161*6236dae4SAndroid Build Coastguard Worker   return count;
162*6236dae4SAndroid Build Coastguard Worker }
163*6236dae4SAndroid Build Coastguard Worker 
/*
 * Tell whether a Content-Type value denotes HTML.
 *
 * Returns 1 when ctype is non-NULL and contains "text/html" (so both a
 * bare "text/html" and "text/html; charset=..." match), 0 otherwise.
 * The comparison is case-sensitive.
 */
int is_html(char *ctype)
{
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}

main(void)169*6236dae4SAndroid Build Coastguard Worker int main(void)
170*6236dae4SAndroid Build Coastguard Worker {
171*6236dae4SAndroid Build Coastguard Worker   signal(SIGINT, sighandler);
172*6236dae4SAndroid Build Coastguard Worker   LIBXML_TEST_VERSION;
173*6236dae4SAndroid Build Coastguard Worker   curl_global_init(CURL_GLOBAL_DEFAULT);
174*6236dae4SAndroid Build Coastguard Worker   CURLM *multi_handle = curl_multi_init();
175*6236dae4SAndroid Build Coastguard Worker   curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
176*6236dae4SAndroid Build Coastguard Worker   curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
177*6236dae4SAndroid Build Coastguard Worker 
178*6236dae4SAndroid Build Coastguard Worker   /* enables http/2 if available */
179*6236dae4SAndroid Build Coastguard Worker #ifdef CURLPIPE_MULTIPLEX
180*6236dae4SAndroid Build Coastguard Worker   curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
181*6236dae4SAndroid Build Coastguard Worker #endif
182*6236dae4SAndroid Build Coastguard Worker 
183*6236dae4SAndroid Build Coastguard Worker   /* sets html start page */
184*6236dae4SAndroid Build Coastguard Worker   curl_multi_add_handle(multi_handle, make_handle(start_page));
185*6236dae4SAndroid Build Coastguard Worker 
186*6236dae4SAndroid Build Coastguard Worker   int msgs_left;
187*6236dae4SAndroid Build Coastguard Worker   int pending = 0;
188*6236dae4SAndroid Build Coastguard Worker   int complete = 0;
189*6236dae4SAndroid Build Coastguard Worker   int still_running = 1;
190*6236dae4SAndroid Build Coastguard Worker   while(still_running && !pending_interrupt) {
191*6236dae4SAndroid Build Coastguard Worker     int numfds;
192*6236dae4SAndroid Build Coastguard Worker     curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
193*6236dae4SAndroid Build Coastguard Worker     curl_multi_perform(multi_handle, &still_running);
194*6236dae4SAndroid Build Coastguard Worker 
195*6236dae4SAndroid Build Coastguard Worker     /* See how the transfers went */
196*6236dae4SAndroid Build Coastguard Worker     CURLMsg *m = NULL;
197*6236dae4SAndroid Build Coastguard Worker     while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
198*6236dae4SAndroid Build Coastguard Worker       if(m->msg == CURLMSG_DONE) {
199*6236dae4SAndroid Build Coastguard Worker         CURL *handle = m->easy_handle;
200*6236dae4SAndroid Build Coastguard Worker         char *url;
201*6236dae4SAndroid Build Coastguard Worker         memory *mem;
202*6236dae4SAndroid Build Coastguard Worker         curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
203*6236dae4SAndroid Build Coastguard Worker         curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
204*6236dae4SAndroid Build Coastguard Worker         if(m->data.result == CURLE_OK) {
205*6236dae4SAndroid Build Coastguard Worker           long res_status;
206*6236dae4SAndroid Build Coastguard Worker           curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
207*6236dae4SAndroid Build Coastguard Worker           if(res_status == 200) {
208*6236dae4SAndroid Build Coastguard Worker             char *ctype;
209*6236dae4SAndroid Build Coastguard Worker             curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
210*6236dae4SAndroid Build Coastguard Worker             printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
211*6236dae4SAndroid Build Coastguard Worker             if(is_html(ctype) && mem->size > 100) {
212*6236dae4SAndroid Build Coastguard Worker               if(pending < max_requests && (complete + pending) < max_total) {
213*6236dae4SAndroid Build Coastguard Worker                 pending += follow_links(multi_handle, mem, url);
214*6236dae4SAndroid Build Coastguard Worker                 still_running = 1;
215*6236dae4SAndroid Build Coastguard Worker               }
216*6236dae4SAndroid Build Coastguard Worker             }
217*6236dae4SAndroid Build Coastguard Worker           }
218*6236dae4SAndroid Build Coastguard Worker           else {
219*6236dae4SAndroid Build Coastguard Worker             printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
220*6236dae4SAndroid Build Coastguard Worker           }
221*6236dae4SAndroid Build Coastguard Worker         }
222*6236dae4SAndroid Build Coastguard Worker         else {
223*6236dae4SAndroid Build Coastguard Worker           printf("[%d] Connection failure: %s\n", complete, url);
224*6236dae4SAndroid Build Coastguard Worker         }
225*6236dae4SAndroid Build Coastguard Worker         curl_multi_remove_handle(multi_handle, handle);
226*6236dae4SAndroid Build Coastguard Worker         curl_easy_cleanup(handle);
227*6236dae4SAndroid Build Coastguard Worker         free(mem->buf);
228*6236dae4SAndroid Build Coastguard Worker         free(mem);
229*6236dae4SAndroid Build Coastguard Worker         complete++;
230*6236dae4SAndroid Build Coastguard Worker         pending--;
231*6236dae4SAndroid Build Coastguard Worker       }
232*6236dae4SAndroid Build Coastguard Worker     }
233*6236dae4SAndroid Build Coastguard Worker   }
234*6236dae4SAndroid Build Coastguard Worker   curl_multi_cleanup(multi_handle);
235*6236dae4SAndroid Build Coastguard Worker   curl_global_cleanup();
236*6236dae4SAndroid Build Coastguard Worker   return 0;
237*6236dae4SAndroid Build Coastguard Worker }
238