/usr/share/doc/libcurl4-doc/examples/htmltidy.c is in libcurl4-doc 7.35.0-1ubuntu2.20.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 | /***************************************************************************
* _ _ ____ _
* Project ___| | | | _ \| |
* / __| | | | |_) | |
* | (__| |_| | _ <| |___
* \___|\___/|_| \_\_____|
*
* Copyright (C) 1998 - 2011, Daniel Stenberg, <daniel@haxx.se>, et al.
*
* This software is licensed as described in the file COPYING, which
* you should have received as part of this distribution. The terms
* are also available at http://curl.haxx.se/docs/copyright.html.
*
* You may opt to use, copy, modify, merge, publish, distribute and/or sell
* copies of the Software, and permit persons to whom the Software is
* furnished to do so, under the terms of the COPYING file.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
***************************************************************************/
/* Download a document and use libtidy to parse the HTML.
* Written by Jeff Pohlmeyer
*
* LibTidy => http://tidy.sourceforge.net
*
* gcc -Wall -I/usr/local/include tidycurl.c -lcurl -ltidy -o tidycurl
*
*/
#include <stdio.h>
#include <tidy/tidy.h>
#include <tidy/buffio.h>
#include <curl/curl.h>
/* curl write callback, to fill tidy's input buffer... */
uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out)
{
uint r;
r = size * nmemb;
tidyBufAppend( out, in, r );
return(r);
}
/* Traverse the document tree */
void dumpNode(TidyDoc doc, TidyNode tnod, int indent )
{
TidyNode child;
for ( child = tidyGetChild(tnod); child; child = tidyGetNext(child) )
{
ctmbstr name = tidyNodeGetName( child );
if ( name )
{
/* if it has a name, then it's an HTML tag ... */
TidyAttr attr;
printf( "%*.*s%s ", indent, indent, "<", name);
/* walk the attribute list */
for ( attr=tidyAttrFirst(child); attr; attr=tidyAttrNext(attr) ) {
printf(tidyAttrName(attr));
tidyAttrValue(attr)?printf("=\"%s\" ",
tidyAttrValue(attr)):printf(" ");
}
printf( ">\n");
}
else {
/* if it doesn't have a name, then it's probably text, cdata, etc... */
TidyBuffer buf;
tidyBufInit(&buf);
tidyNodeGetText(doc, child, &buf);
printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:"");
tidyBufFree(&buf);
}
dumpNode( doc, child, indent + 4 ); /* recursive */
}
}
int main(int argc, char **argv )
{
CURL *curl;
char curl_errbuf[CURL_ERROR_SIZE];
TidyDoc tdoc;
TidyBuffer docbuf = {0};
TidyBuffer tidy_errbuf = {0};
int err;
if ( argc == 2) {
curl = curl_easy_init();
curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf);
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
tdoc = tidyCreate();
tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */
tidyOptSetInt(tdoc, TidyWrapLen, 4096);
tidySetErrorBuffer( tdoc, &tidy_errbuf );
tidyBufInit(&docbuf);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf);
err=curl_easy_perform(curl);
if ( !err ) {
err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */
if ( err >= 0 ) {
err = tidyCleanAndRepair(tdoc); /* fix any problems */
if ( err >= 0 ) {
err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */
if ( err >= 0 ) {
dumpNode( tdoc, tidyGetRoot(tdoc), 0 ); /* walk the tree */
fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */
}
}
}
}
else
fprintf(stderr, "%s\n", curl_errbuf);
/* clean-up */
curl_easy_cleanup(curl);
tidyBufFree(&docbuf);
tidyBufFree(&tidy_errbuf);
tidyRelease(tdoc);
return(err);
}
else
printf( "usage: %s <url>\n", argv[0] );
return(0);
}
|