#!/usr/local/bin/perl
####################################################
# Written by Gantug Damdinsuren  May 09, 2002
# Modified: October 16, 2002
# gantug@mongoliaphotogallery.com
#
# This CGI script searches a directory and its subdirectories for the
# key word(s) the user typed in, and returns a link to every html file
# in which a key word was found.  The links are sorted by the number of
# times the key word was found in the file (the largest number comes
# first).  Only files whose name ends in ".html" are searched.
#
# NOTE(review): the HTML tags of the page templates were missing from the
# copy of this file that was reviewed; the markup below is a
# reconstruction around the surviving text - confirm against the
# deployed pages.
####################################################
use strict;
use warnings;

use CGI qw(:standard);
use File::Find;

# The following string is concatenated at the beginning of each link to
# a file that contains a match.
my $url = "YOUR_URL_SHOULD_BE_HERE";

# Directory where the search starts.  Replace the value to search in
# another directory.
my $directory_name = ".";

# Raw user input; a missing parameter is treated like an empty string.
my $search_word = param("search_word");
$search_word = "" unless defined $search_word;

# If the search string is empty (or only whitespace), warn the user.
if ($search_word !~ /\S/) {
    dienice("There is no word(s) to search. Please enter search word(s) and try again.");
}

# Accept only word characters and whitespace.  \A and \z anchor the
# whole string, so a trailing newline cannot slip past the check.
if ($search_word !~ /\A[\w\s]+\z/) {
    dienice("Search words must consist of word characters (letters and digits) only. No punctuations are allowed. Please revise your search word(s) and try again.");
}

# Split on runs of whitespace (split ' ' also ignores leading space, so
# no empty "words" are produced) and drop case-insensitive duplicates,
# because matching below ignores case.
my %seen;
my @search_words = grep { !$seen{ lc $_ }++ } split ' ', $search_word;

# One precompiled, case-insensitive, whole-word pattern per search word.
# \Q...\E keeps the word literal even though it was already validated.
my %pattern_for = map { $_ => qr/\b\Q$_\E\b/i } @search_words;

# Each element is a hash ref: { path => ..., word => ..., count => ... }.
my @matches;

# Walk the directory tree.  no_chdir keeps the working directory
# unchanged, so $File::Find::name can be opened as-is.
find(
    {
        no_chdir => 1,
        wanted   => sub {
            my $path = $File::Find::name;

            # Cheap name test first, then the file tests.
            return unless $path =~ /\.html\z/;
            return unless -f $path && -T $path;

            my %count_of = count_matches($path, \%pattern_for);
            for my $word (keys %count_of) {
                push @matches,
                    { path => $path, word => $word, count => $count_of{$word} };
            }
        },
    },
    $directory_name
);

if (!@matches) {
    dienice("The search word was not found anywhere.");
}

# Largest number of matches goes first; path and word break ties so the
# report order does not depend on directory traversal order.
my @sorted_matches = sort {
           $b->{count} <=> $a->{count}
        or $a->{path}  cmp $b->{path}
        or $a->{word}  cmp $b->{word}
} @matches;

# Build one table row per (file, word) pair.
my @final_lines_to_print;
for my $match (@sorted_matches) {
    my $full_url = url_for_path($match->{path}, $directory_name, $url);

    # File name without its directory part, used as the link text.
    (my $tail = $match->{path}) =~ s{.*/}{}s;

    my $word_uppercased = uc $match->{word};
    my $href            = CGI::escapeHTML($full_url);
    my $label           = CGI::escapeHTML($tail);

    push @final_lines_to_print,
          "<tr>\n"
        . "<td>$word_uppercased</td>\n"
        . "<td>$match->{count}</td>\n"
        . "<td><a href=\"$href\">$label</a></td>\n"
        . "</tr>\n";
}

# Final result of the search.
print header();
print <<"END_HEADER";
<html>
<head><title>Search Result</title></head>
<body>
<h2>Here is Your Search Result:</h2>
<table border="1">
<tr>
<th>Search Word</th>
<th>Number of Times Found</th>
<th>Name of the File that Contains Search Word</th>
</tr>
END_HEADER

# Here is the actual array that contains each result row.
print "@final_lines_to_print\n";

print <<"END_FOOTER";
</table>
<p>&nbsp;</p>
<hr>
<p>Columbia, MO 2002</p>
<p>This search CGI script was written by Gantug Damdinsuren</p>
<p>&nbsp;</p>
</body>
</html>
END_FOOTER

# count_matches($path, \%pattern_for)
#   Counts, for every key of %pattern_for, how many times its pattern
#   occurs in the file at $path.
#   Returns a list of (word => count) pairs containing only the words
#   that occurred at least once; returns an empty list when the file
#   cannot be opened (a warning goes to STDERR so the CGI response is
#   not corrupted).
sub count_matches {
    my ($path, $pattern_for) = @_;

    my $fh;
    if (!open($fh, '<', $path)) {
        warn "Can't open $path - continuing: $!\n";
        return;
    }

    my %count_of;
    while (my $line = <$fh>) {
        for my $word (keys %{$pattern_for}) {
            # List assignment in scalar context yields the match count.
            my $hits = () = $line =~ /$pattern_for->{$word}/g;
            $count_of{$word} += $hits if $hits;
        }
    }
    close $fh;

    return %count_of;
}

# url_for_path($path, $start_dir, $base_url)
#   Turns a path reported by File::Find into a link: the leading
#   $start_dir is removed from $path (for the default "." this turns
#   "./dir/file.html" into "/dir/file.html") and the remainder is
#   appended to $base_url.
sub url_for_path {
    my ($path, $start_dir, $base_url) = @_;

    my $relative = $path;
    if (index($relative, $start_dir) == 0) {
        $relative = substr($relative, length $start_dir);
    }

    return $base_url . $relative;
}

# dienice($msg)
#   Prints an HTML error page containing $msg and terminates the script.
#   $msg is inserted verbatim, so callers must pass trusted text only.
sub dienice {
    my ($msg) = @_;

    print "Content-type: text/html\n\n";
    print <<"END_DIENICE";
<html>
<head><title>Something Goes Wrong!</title></head>
<body>
<h1>KOMU TV</h1>
<h2>Something is wrong with the information you submitted:</h2>
<p>$msg</p>
<hr>
<p>Columbia, MO 2002</p>
</body>
</html>
END_DIENICE
    exit;
}