#!/usr/bin/perl
# searchspy - figure out what search engine queries are leading to your
# web pages by looking at referrer information.
# Usage: ./searchspy < access_log | sort
# Nelson Minar http://www.media.mit.edu/~nelson/
# written on the way to LA Wed Feb 11 17:19:32 EST 1998
# $Id: searchspy,v 1.8 1999/07/21 21:49:36 nelson Exp nelson $
# Copyright (C) 1998 Nelson Minar all rights reserved.

use strict;
use warnings;

# Configurable variables:

# a pattern matching the URLs you care about. (^ means match all).
our $searchURL = '^';
# $searchURL = '^/people/nelson';

# debug turns on some info about how this script could be failing
# - prints out warnings about unknown search engines
our $debug = 0;

# How it works:
# Look through a standard access_log (at least, Apache 1.0.5) and
# look for requests where the referrer has a ? in it. These are referrers
# from a CGI script, and are almost always search engines.
# If it has a ? in it, then the query part of the referrer is parsed into
# the %a hash and then passed through the table %engineMap to extract the
# particular field that's the search term. This is then passed through a
# filter to turn the encoding of the search term into something readable,
# dealing with +s and %xx.

# All this parsing is quick and dirty. Please mail improvements
# (especially patterns for more search engines) to the author.

# The fields of the referrer currently being examined, name => raw
# (still URL-encoded) value. The subs in %engineMap read from this.
our %a;

# Applytable, a handy function
# Usage: applytable(string, reference-to-hashtable, arglist)
# The keys of the hash table are patterns (matched case-insensitively
# against string), the values are code references. The function belonging
# to the first matching pattern is called as function(arglist), arglist is
# passed through unmodified, and its result is returned. Returns undef if
# no pattern matches.
# Patterns are tried longest first (ties broken alphabetically), so when
# several patterns match the same string the most specific one wins and
# the outcome does not depend on Perl's randomised hash ordering.
# Note: $& can be used to extract which string matched you.
# Examples
#  using an existing hashtable, with a two element arglist
#   applytable($_, \%functionmap, 'lala', 'mama');
#  using an anonymous hash, with one anonymous sub, no arglist
#   applytable("Foobar", {'xyzzy' => \&subNotUsed,
#                         'bar' => sub { print "Bar detected\n" }});
sub applytable {
    my ($inputString, $fmap, @arglist) = @_;

    my @patterns =
        sort { length($b) <=> length($a) or $a cmp $b } keys %{$fmap};

    foreach my $k (@patterns) {
        # The match is done right before the call so that $& is still
        # valid inside the called function.
        if ($inputString =~ /$k/i) {
            return $fmap->{$k}->(@arglist);
        }
    }

    # Explicit undef (not a bare return) so that callers always get
    # exactly one value back, as they always have.
    return undef;
}

# Return the value of the first of the named fields in %a that is present
# and non-empty, or undef if none of them is.
sub _first_field {
    my @names = @_;
    foreach my $name (@names) {
        my $value = $a{$name};
        return $value if defined $value && length $value;
    }
    return undef;
}

# Join the values of the named fields in %a with '+' (which later decodes
# to a space), skipping fields that are absent or empty so that no dangling
# separator ends up in the output.
sub _join_fields {
    my @names = @_;
    return join '+', grep { defined $_ && length $_ } @a{@names};
}

# This table controls how individual queries are parsed by search engine.
# The left side is a pattern that matches the search engine's URL. The
# ordering here doesn't make a difference; if more than one pattern matches
# a referrer, applytable() picks the longest one.
# The right side is a subroutine to return the string that is the actual
# search query. See the call to applytable() to understand all the context.
# $a{FIELD} is the value of the field named.
our %engineMap = (
    'metacrawler'                 => sub { $a{'general'} },
    'mckinley\.com'               => sub { $a{'search'} },
    'lycos'                       => sub { $a{'query'} },
    'www\.goo\.ne\.jp'            => sub { $a{'MT'} },
    'www\.fireball\.de'           => sub { $a{'q'} },
    'hotbot\.com'                 => sub { $a{'MT'} },
    'infoseek\.com'               => sub { _join_fields(qw(qt oq)) },
    'planetsearch\.com'           => sub { $a{'text'} },
    'yahoo'                       => sub { $a{'p'} },
    'snap\.com'                   => sub { $a{'keyword'} },
    'excite'                      => sub { _first_field(qw(FI_1 search s)) },
    'designlab\.ukans\.edu'       => sub { $a{'queryTerm'} },
    'search\.metafind\.com'       => sub { $a{'q'} },
    'lokace'                      => sub { $a{'MOTCLEF'} },
    'looksmart\.com'              => sub { $a{'key'} },
    'dogpile\.com'                => sub { $a{'q'} },
    'www\.sear\.ch'               => sub { $a{'q'} },
    'www\.nlsearch\.com'          => sub { $a{'qr'} },
    'www\.naver\.com'             => sub { $a{'query'} },
    'www\.mamma\.com'             => sub { $a{'query'} },
    'search\.com/Infoseek'        => sub { $a{'QUERY'} },
    'search\.com/AltaVista'       => sub { $a{'query'} },
    'netfind\.aol\.com'           => sub { _first_field(qw(search s)) },
    'webcrawler\.com'             => sub { _first_field(qw(searchText search text)) },
    'altavista\.digital\.com'     => sub { $a{'q'} },
    'altavista\.com'              => sub { $a{'q'} },
    'altavista\.telia\.com'       => sub { $a{'q'} },
    'altavista\.magallanes'       => sub { $a{'q'} },
    'altavista\.yellowpages\.com' => sub { $a{'q'} },
    'infind\.inference\.com'      => sub { $a{'query'} },
    'ahoy\.cs\.washington\.edu'   => sub { _join_fields(qw(first last)) },
    'www\.northernlight\.com'     => sub { $a{'qr'} },
    'www\.goto\.com'              => sub { $a{'Keywords'} },
    'www\.realnames\.com'         => sub { $a{'realName'} },
    'search\.mit\.edu'            => sub { $a{'qt'} },
    'google\.netscape'            => sub { $a{'query'} },
    'home\.cnet'                  => sub { $a{'qt'} },
    'infoseek\.go'                => sub { $a{'qt'} },
    'search\.cnet'                => sub { $a{'QUERY'} },
    'search\.go2net'              => sub { $a{'general'} },
    'search\.moonport'            => sub { $a{'q'} },
    'www\.alltheweb'              => sub { $a{'query'} },
    'www\.go\.com'                => sub { $a{'qt'} },
    'www\.google'                 => sub { $a{'q'} },
    'savvysearch\.com'            => sub { $a{'q'} },
);

our $engineSubs = \%engineMap;

# Parse one access_log line.
# Returns (url, referrer) on success, or an empty list if the line does not
# look like a log entry with a referrer field. url is '' when the request
# line itself could not be understood.
sub _parse_log_line {
    my ($line) = @_;

    my ($req, $ref) = $line =~ m{
        \A \S+ [ ] \S+ [ ] \S+ [ ]   # host, ident, authenticated user
        \[ [^\]]+ \] [ ]             # timestamp
        " (.+?) " [ ]                # request line; non-greedy so it cannot
                                     #  run on into a quoted user agent
        \S+ [ ] \S+ [ ]              # status code, response length
        "? ( [^\s"]+ ) "?            # referrer, without its optional quotes
    }x or return;

    my ($url) = $req =~ /^\w+ (\S+)/;
    $url = '' unless defined $url;

    return ($url, $ref);
}

# Split the query part of a URL (the text after the ?) into a list of
# name => value pairs, values still URL-encoded.
# Each &-separated pair is split on its first = only, so a field without a
# value (looksmart sends things like ?l&q=foo) becomes name => '' instead
# of shifting every later name/value out of step, and a value that itself
# contains = is kept whole. If a name repeats, the last value wins.
sub _parse_query_string {
    my ($query) = @_;
    my %field;

    foreach my $pair (split /&/, $query) {
        next if $pair eq '';
        my ($name, $value) = split /=/, $pair, 2;
        $field{$name} = defined $value ? $value : '';
    }

    return %field;
}

# Turn a URL-encoded search term into readable text.
sub _decode_search_term {
    my ($term) = @_;

    # convert the +s to spaces in the search term (before %2B is decoded,
    # so an encoded literal + survives)
    $term =~ tr/+/ /;

    # and convert all the %xx stuff back into proper characters; only real
    # hex digits count, anything else is left alone
    $term =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

    return $term;
}

# Read the access log from the files named on the command line (or from
# standard input), print "url search-term" for every understood search
# referrer and finish with a one line summary.
sub main {
    my $parsedQueries   = 0;
    my $unparsedQueries = 0;

    while (my $line = <>) {
        # first parse out the relevant fields from the logfile
        my ($url, $ref) = _parse_log_line($line);
        next unless defined $ref;

        # Check if it's a URL we're interested in and then see if the
        # referrer is a search. Any referrer with a ? is a candidate.
        # (?:...) keeps an empty $searchURL from turning into Perl's
        # special "reuse the last successful pattern" empty regex.
        next unless $url =~ /(?:$searchURL)/ && $ref =~ /\?/;

        # ok, we have a candidate. Now unpack the search engine name and
        # the particular request, splitting on the FIRST ? so a ? inside
        # the query itself stays part of the query.
        my ($engine, $req) = $ref =~ /\A([^?]*)\?(.*)/s;

        # now unpack the request into the %a hash.
        %a = _parse_query_string($req);

        # and pass it through the table to extract the search term
        my $search = applytable($engine, $engineSubs);

        # "0" is a perfectly good search term, so test for presence
        # rather than truth.
        if (defined $search && length $search) {
            $search = _decode_search_term($search);

            # and print out the exciting information!
            print "$url $search\n";
            $parsedQueries++;
        }
        else {
            if ($debug) {
                print STDERR "Unknown engine $engine -> $req\n";
            }
            $unparsedQueries++;
        }
    }

    print "~~~ $parsedQueries queries successfully parsed, $unparsedQueries weren't understood.\n";
    return;
}

# Run only when executed as a script, so the subs above can be loaded
# (e.g. with require) without consuming any input.
main() unless caller;