package WWWgrab;

=head1 NAME

WWWgrab - fetch pages, response headers or status lines from http URLs
(wwwgrab, wwwget, wwwhead, wwwhttp)

=head1 SYNOPSIS

    use lib '/path_to_dir/this_module_is_in';
    use WWWgrab;

    $www_page_contents         = wwwgrab(URL);
    $response_headers_and_page = wwwget(URL);
    $response_headers_only     = wwwhead(URL);
    $http_success_string_only  = wwwhttp(URL);

=head1 DESCRIPTION

WWWgrab.pm is a perl5 module to get various response information from an
http URL on the World Wide Web.  It includes functions to get just the
page contents, HTTP response headers with contents, just the HTTP
response headers or just the first line of the response header as a
string to validate the URL.

URL is a full url to a directory or file.  The URL can include a
urlencoded query string to test CGI scripts that use the GET method.

Examples of URLs:

    http://www.foo.com
    http://www.foo.com/bar/
    http://www.foo.com/~bar/index.html
    http://www.foo/~bar/script.cgi?var1=2&var2=string+with+plus+for+spaces

Failures are reported in-band rather than by dying: wwwgrab, wwwget and
wwwhead return the string "Invalid URL", a string starting with
"Socket Error: ", or a string starting with "Can't connect: ".  wwwhttp
returns undef whenever no status line could be obtained.

Last Modified 4/4/2003 by David Efflandt (efflandt@xnet.com)
http://www.xnet.com/~efflandt/

Based on the script 'wwwgrab' by Jeff Ballard (ballard@cae.wisc.edu)
http://www.engr.wisc.edu/~ballard/

This module is freeware.  No warranty is expressed, implied, nor granted
to you in any way.  Use of this module is at your own risk.

=over 4

=item wwwgrab(URL)

Returns www page contents or script output without response headers.
Useful for copying a page from the Web.

=item wwwget(URL)

Returns HTTP response headers, a blank line, then page contents or CGI
script results.  Useful for testing CGI scripts to see if they return
proper headers.

=item wwwhead(URL)

Returns full HTTP response headers without page contents.  Can tell you
webserver version, last modified date, content type, etc.

=item wwwhttp(URL)

Returns string containing the first line of HTTP response.  Useful for
validating links.  Sample response: "HTTP/1.1 200 OK".

=back

=cut

use strict;
use warnings;

use Exporter ();
our @ISA    = qw(Exporter);
our @EXPORT = qw(wwwgrab wwwget wwwhead wwwhttp);

# Required for perl5
use Socket;

# Page contents: GET the URL and return only what follows the response
# headers.
sub wwwgrab {
    my ($url) = @_;
    return _fetch($url, 'GET', 'HTTP/1.0', 1);
}

# Response headers and contents: GET the URL and return the raw response.
sub wwwget {
    my ($url) = @_;
    return _fetch($url, 'GET', 'HTTP/1.0', 0);
}

# Response headers only (no contents): issue a HEAD request.
sub wwwhead {
    my ($url) = @_;
    return _fetch($url, 'HEAD', 'HTTP/1.0', 0);
}

# Success string (1st line of HTTP response), without its line ending.
# Returns undef when the request produced no newline-terminated first
# line, which includes every in-band error string from _fetch.
sub wwwhttp {
    my ($url) = @_;
    my $response = _fetch($url, 'HEAD', 'HTTP/1.0', 0);

    # Only trust $1 after a successful match; otherwise it may still hold
    # a capture from some earlier, unrelated match.
    my $status_line;
    if (defined $response && $response =~ /^([^\r\n]+)\r*\n/) {
        $status_line = $1;
    }
    return $status_line;
}

# Backward-compatible entry point: performs the request described by the
# package variables $url, $method, $http and $body, which is how earlier
# versions of this module passed arguments.  New code should call the
# exported functions instead.
sub grab {
    our ($url, $method, $http, $body);
    return _fetch($url, $method, $http, $body);
}

# Request $url using $method and $http version.
#
#   $url           - http URL, optionally with :port, path and query string
#   $method        - request method, e.g. 'GET' or 'HEAD'
#   $http          - protocol version such as 'HTTP/1.0'; when false, a
#                    bare "METHOD path" request without Host header is sent
#   $strip_headers - when true, everything up to and including the first
#                    blank line of the response is discarded
#
# Returns the response text (undef if the server sent nothing), or one of
# the strings "Invalid URL", "Socket Error: ..." or "Can't connect: ...".
sub _fetch {
    my ($url, $method, $http, $strip_headers) = @_;

    # Given an http address, rip it into its corresponding parts.  The
    # captures are taken from the match result itself so a failed match
    # yields undef instead of leftovers from an earlier match.
    return "Invalid URL" unless defined $url;
    my ($authority, $path) = $url =~ m{http://([^/]*)/*([^ ]*)};
    return "Invalid URL" unless $authority;
    $path = "/$path";

    my ($host, $port) = $authority =~ /^([^:]*):*([^ ]*)/;
    return "Invalid URL" unless defined $host && length $host;
    $port = 80 unless $port;
    return "Invalid URL" unless $port =~ /^\d+$/ && $port <= 65535;

    # Uncomment for testing
    #print STDERR "[$host] [$port] [$path]\n";

    # Assemble request.  HTTP lines end in CRLF, and the Host header
    # carries the port whenever it is not the default one.
    my $request = "$method $path";
    if ($http) {
        my $host_header = $port == 80 ? $host : "$host:$port";
        $request .= " $http\r\nHost: $host_header";
    }
    $request .= "\r\n\r\n";

    # Resolve the host and build the socket address with the Socket
    # helpers instead of a hand-written pack template.
    my @octets = getaddress($host);
    return "Can't connect: cannot resolve host '$host'" unless @octets == 4;
    my $remote = pack_sockaddr_in($port, pack('C4', @octets));

    # Open a socket and request the data.
    my $proto = getprotobyname('tcp') || 0;
    socket(my $sock, AF_INET, SOCK_STREAM, $proto)
        or return "Socket Error: $!";
    unless (connect($sock, $remote)) {
        my $error = "Can't connect: $!";
        close $sock;
        return $error;
    }

    # Unbuffer the socket, then put back whichever handle the caller had
    # selected rather than assuming it was STDOUT.
    my $previous = select($sock);
    $| = 1;
    select($previous);

    unless (print {$sock} $request) {
        my $error = "Socket Error: $!";
        close $sock;
        return $error;
    }

    # Read line by line regardless of what the caller set $/ to.
    local $/ = "\n";
    my $data;
    my $in_headers = $strip_headers;
    while (defined(my $line = <$sock>)) {
        if ($in_headers) {
            # The first line without any non-blank character ends the
            # header section; that separator line is dropped as well.
            $in_headers = 0 unless $line =~ /\S/;
            next;
        }
        $data .= $line;
    }
    close $sock;
    return $data;
}

# Resolve $host (name or dotted quad) to the four octets of its IPv4
# address.  Returns an empty list when the host cannot be resolved.
sub getaddress {
    my ($host) = @_;
    return unless defined $host && length $host;
    my $packed = inet_aton($host);
    return unless defined $packed;
    return unpack('C4', $packed);
}

1;