#!/usr/bin/perl
# Full-text in-files search for my website using a mirror of the
# library/dir/structure/and/files.pdf where the files.pdf is actually a
# pre-computed plaintext copy of the (pdf/djvu/epub) with a false extension.
# Basically a POE event loop that tails a webserver log looking for GETs
# ending in .search, which then calls File::Find find() to look for the
# parsed-out query in every file's contents and generates a .html page of
# results.
# I hope you appreciate this documentation, future me.

use strict;
use warnings;
use File::Find;
use POE qw(Wheel::FollowTail); # POE is the best module ever
use POSIX qw(strftime);
use IO::Handle; # for auto-flush to get the open() write results on disk as a file asap
# required binaries: pdftotext, djvutxt, ebook-convert (from calibre)
#use URI::Escape;

my $debugon = 1; # spammy in console if 1, quiet if 0.

# webserver log                                      | file on disk    | this script generates these .html files
# user request                                         static page       >search results      >>appended indices
# GET /library/Physics/find-some search here.search -> searched.html  -> search-done.html  -> search-log.html
#                                                                          |
#                                                                          |--> $savedquerywithhyphens-$savedsearchpath.html

### How to set up the plain text "mirror" of all the directories and files.
## Generate copy of directory structure (without files) to populate with the text cache first...
# rsync -a -f"+ */" -f"- *" /home/superkuh/library/ /home/superkuh/gob/librarytextcache/
## Optionally run textcache-generate.pl to pre-create the mirror files with just plain text inside them. This runs then stops.
# ./textcache-generate.pl
## Now this script thurstonia-textcache.pl is ready to operate and search. nginx doesn't need to know about it. This runs forever.
# ./thurstonia-textcache.pl

### nginx required config for .search ending URLs to redirect to the actual search output .html path.
## in mime.types add in "search" to text/html
#types {
#    text/html    html htm shtml lol search;
## Optionally set up an nginx limit rate to not get overloaded with searches
#limit_req_zone $binary_remote_addr zone=search:1m rate=30r/m;
## nginx location based match for .search that redirects to the intermediate search page /hello/searched.html which meta-refreshes to $htmloutfullpath
#    location ~ .*?\.search {
#        add_header x-search-library "Nice. Thanks for trying my search, dude.";
#        limit_req zone=search nodelay;
#        rewrite .*?\.search$ /hello/searched.html last;
#    }
## searched.html for the intermediate "waiting page" and redirect to results after it is created by this perl script.
## Basically anything with the meta-refresh line will work.
## NOTE(review): the markup of this sample was missing from the copy under
## review (only the visible text survived); the tags below are a
## reconstruction -- confirm against the deployed searched.html.
# <html>
# <head>
# <meta http-equiv="refresh" content="4; url=/hello/search-done.html">
# <title>search commencing...</title>
# </head>
# <body>
# <p>Search commencing... please wait 4 seconds. This page will automatically redirect.</p>
# <p>If the coming results page is blank: refresh after a second or two more.</p>
# <p>Search results will update progressively. Keep refreshing if slow.</p>
# </body>
# </html>

my $LIBRARYDIR='/home/superkuh/library/'; # real directory with real files
my $GHOSTDIR='/stuff/ghost-library/'; # (not used anymore) soft link "mirror" of the dirs and files
my $PUBLICDIR='/stuff/public-library/'; # (not used anymore) rsync filtered (dmca claims, etc) copy of the $GHOSTDIR that nginx serves as /library/
my $TEXTCACHEDIR='/home/superkuh/gob/librarytextcache'; # mirror of $LIBRARYDIR but only plaintext copies: odd man out without a trailing slash "/" -- watch this
my $webroot = '/home/superkuh/www'; # no trailing slash
my $websubdir = '/hello/'; # the trailing slash comes from this, even if just '/'
my $htmloutfullpath = "$webroot$websubdir" . "search-done.html"; # the file/page where the results are shown
#my $htmloutfullpath = '/home/superkuh/www/hello/search-done.html'; # the file/page where the results are shown
my $logoutfullpath = "$webroot$websubdir" . "search-log.html"; # index of all old searches and results.
#my $logoutfullpath = '/home/superkuh/www/hello/search-log.html'; # index of all old searches and results.

# Get the current time at script launch for no purpose at all.
# Extra core modules needed only by the subs below. `use` is resolved at
# compile time, so declaring them here is equivalent to the top of the file.
use File::Basename qw(dirname);
use File::Copy qw(copy);
use File::Path qw(make_path);
use File::Temp ();

my $current_time = time;
# Format the date and time as a string
my $datehere = strftime("%Y-%m-%d %H:%M:%S", localtime($current_time));
print "The current date and time is: $datehere\n" if $debugon;

# One POE session: tail the nginx access log and hand every new line to
# parseline(). Log rollover is noticed but needs no action.
POE::Session->create(
    inline_states => {
        _start => sub {
            $_[HEAP]{tailor} = POE::Wheel::FollowTail->new(
                Filename   => "/var/log/nginx/access.log",
                InputEvent => "got_log_line",
                ResetEvent => "got_log_rollover",
            );
        },
        got_log_line => sub {
            #print "Log: $_[ARG0]\n";
            parseline($_[ARG0]);
        },
        got_log_rollover => sub {
            #print "Log rolled over.\n";
        },
    }
);

POE::Kernel->run();
exit;

# parseline($line)
#
# Handle one access-log line. If it is a
#   GET /library/<path>find-<words>.search HTTP...
# request, search every matching file below $LIBRARYDIR/<path> for <words>
# (file contents first, then the file path), write the progressively
# flushed results page to $htmloutfullpath, copy that page to a per-search
# archive file and append an entry to $logoutfullpath.
#
# Parameters: $line - one raw log line.
# Returns:    nothing useful; returns early for lines that are not searches.
# Dies:       when the results page or the search log cannot be opened.
#
# Example of a line that triggers a search:
#91.243.93.233 - - [17/Apr/2024:13:40:49 -0500] "GET /library/find-neutron.search HTTP/1.1" 200 2698 "http://superkuh.com/library/find-neutron.search" "Mozilla/5.0 ..."
sub parseline {
    my $line = shift;
    return unless defined $line;

    # Everything outside this whitelist is dropped from path and query.
    my $unsafe_chars = qr{[^a-zA-Z0-9,\-.\s?"':/_]};

    $line =~ s/\%20/ /g;

    # Only trust $1/$2 after a successful match; after a failed match they
    # still hold whatever an earlier, unrelated match captured.
    # NOTE(review): the nested quantifiers in the find-(...)+ group look like
    # they could backtrack heavily on long runs of word characters that are
    # not followed by ".search" -- worth confirming against hostile log lines.
    return
        unless $line =~ m#GET /library/(.*?)(find-(\w+\s?\w+?\s?\w+?\s?\w+?)+\.search(\+?)) HTTP#; # only 4 words? well it is exact match...
    my ($path, $htmlfilename) = ($1, $2);

    # .search+ used to select full-text mode and .search name-only mode;
    # full-text is now always on.
    my $fulltext = 1;

    # sanitize $path too in case someone makes up a nasty one and nginx still
    # puts it in the access.log instead of error.log
    $path =~ s/$unsafe_chars//g;

    # "." and "/" are whitelisted, so "../" segments would otherwise let a
    # request walk out of $LIBRARYDIR. Checked after sanitizing because
    # stripping characters can itself produce "..".
    return if $path =~ m{(?:\A|/)\.\.(?:/|\z)};

    # NOTE(review): an empty $path (search of the whole library) is still
    # accepted here, exactly as before, although the results page footer
    # says full-library searches are disabled -- confirm which is intended.

    my ($query) = $htmlfilename =~ /find-(.+)\.search/;
    return unless defined $query;

    # The old backslash-escaping passes were undone by this whitelist (it
    # removes the backslashes again), so the whitelist alone is equivalent.
    $query =~ s/$unsafe_chars//g;

    print "\nquery: $query, path: $path, fulltext: $fulltext, html: $htmlfilename\n";
    # `return`, not `next`: there is no loop here, and `next` would unwind
    # into whatever loop the caller (the POE kernel) happens to be running.
    return unless $query;

    my $searchpath = $LIBRARYDIR . $path;

    print "fullpath: $htmloutfullpath";
    open(my $html_out, '>', $htmloutfullpath)
        or die "Can't write to $htmloutfullpath\n$!";
    $html_out->autoflush(1); # results appear on disk as they are found

    my $shown_query = _html_escape($query);
    my $shown_path  = _html_escape($path);
    (my $log_url = $logoutfullpath) =~ s/^\Q$webroot\E//;

    # NOTE(review): the copy of this script under review had every HTML tag
    # missing from its output strings (only the visible text survived, and
    # $webpath was computed but never printed). The markup below is a minimal
    # reconstruction around that text -- compare with the deployed pages.
    print {$html_out} "<html>\n<head><title>search results</title></head>\n<body>\n"
        . "<h1>superkuh's half-assed full-text library sub-directory search</h1>\n";
    print {$html_out} "\n<p><a href=\"" . _html_escape($log_url)
        . "\">Previous search results indices.</a></p>";
    print {$html_out} "\n<p>For the query \"$shown_query\" in the directory $shown_path,"
        . " these results were found (so far, refresh for progress):</p>\n<ol>\n";

    # find() calls the anonymous sub once per directory entry below
    # $searchpath, with $_ set to the bare name and $File::Find::name to the
    # full path (the current directory is the entry's directory).
    find(
        sub {
            my $barefilename = $_; # e.g. "Optical System Design_ 2nd Ed_ Robert F Fischer_ 2008.pdf"
            return if $barefilename =~ /^\./;
            return
                unless $barefilename
                =~ /\.(pdf|djvu|txt|jpg|gif|jpeg|png|mov|mpg|mp4|mpeg|avi|mkv|ogg|wav|mp3|tar|7z|rar|gz|fits|bz2|pdb|html|epub|mht|cbz|doc|lit|exe|ps|chm|awesome)/i;
            return if -d $barefilename;
            return unless -r $barefilename;

            my $fullname = $File::Find::name;
            my $text     = _text_for_file($fullname, $fulltext);
            $text = '' unless defined $text;

            # Match the file contents, or failing that the file path/name.
            # \Q...\E keeps the query literal in both patterns.
            return
                unless $text =~ /\Q$query\E/i
                or $fullname =~ /\Q$query\E/i;

            my ($relative) = $fullname =~ m#\Q$LIBRARYDIR\E(.*)#s;
            $relative = '' unless defined $relative;
            my $webpath  = _url_escape_path('/library/' . $relative);
            my $filesize = niceSize(-s $fullname);

            print {$html_out} "<li><a href=\"" . _html_escape($webpath) . "\">"
                . _html_escape($barefilename) . "</a> ($filesize)</li>\n";
        },
        $searchpath
    );

    # html footer for search
    print {$html_out} "</ol>\n\n<p>Only searching subdirectories is allowed. /library/ full searches are disabled. To search something else, just add \"/find-your search terms.search\" to any directory in the library.</p>\n"
        . "<p>\"/find-a few words.search\", when added to any URL in the Library subdirectory will return search results for \"a few words\" after a delay. If there is anything not alphanumberic in the string the log parser won't even trigger the search. No caching or pre-conversion to text is implemented yet so it has to convert every time and it's really slow, as you can see.</p>\n"
        . "</body>\n</html>\n";
    close $html_out or warn "$!";

    # Append this search to the index of previous searches and keep a copy
    # of the results under a per-search file name. The index only ever gets
    # paragraphs appended, so any hand-written header at its top survives.
    my $savedquerywithhyphens = $query;
    $savedquerywithhyphens =~ s/\s/-/g;
    my $savedsearchpath = $path;
    $savedsearchpath =~ s/\//-/g;
    $savedsearchpath =~ s/\s/-/g;
    $savedsearchpath =~ s/-$//;
    my $seperator = 'bo_op'; # I have my reasons.
    my $savedfilename       = "$savedquerywithhyphens-$seperator-$savedsearchpath.html";
    my $savedsearchfullpath = "$webroot$websubdir" . $savedfilename;

    # File::Copy instead of `cp` through the shell: quotes, spaces or glob
    # characters in the file name can no longer be reinterpreted.
    print "DEBUG: cp $htmloutfullpath $savedsearchfullpath\n";
    copy($htmloutfullpath, $savedsearchfullpath)
        or warn "Can't copy $htmloutfullpath to $savedsearchfullpath: $!\n";

    open(my $log_out, '>>', $logoutfullpath)
        or die "Can't write to $logoutfullpath\n$!";
    # Stamp with the time of this search; the launch-time value in
    # $current_time made every log entry carry the same date.
    my $stamp     = strftime("%Y-%m-%d %H:%M:%S", localtime(time));
    my $saved_url = _html_escape($websubdir . _url_escape_path($savedfilename));
    print {$log_out} "<p>$stamp: For query \"$shown_query\" in the directory $shown_path,"
        . " <a href=\"$saved_url\">these results</a>.</p>\n";
    close $log_out or warn "$!";

    return;
} # parseline event end

# Return the searchable text for one library file. Uses the plain-text copy
# in $TEXTCACHEDIR when it exists; otherwise converts (pdf/djvu/epub) or
# raw-reads the file and stores the result in the cache for next time.
# Returns undef when nothing could be read.
sub _text_for_file {
    my ($source_path, $fulltext) = @_;

    # Same relative path, but under the text cache (which has no trailing slash).
    (my $cache_path = $source_path) =~ s/^\Q$LIBRARYDIR\E/$TEXTCACHEDIR\//;

    if (-e $cache_path) {
        print "File $cache_path exists in text library cache.\n" if $debugon;
        return _slurp_file($cache_path);
    }

    # Not cached yet: convert on the fly.
    my $text;
    if ($fulltext and $source_path =~ /pdf$/i) {
        print "\npdftotext \"$source_path\" -\n";
        $text = _run_capture('pdftotext', $source_path, '-');
    }
    elsif ($fulltext and $source_path =~ /djvu$/i) {
        print "\ndjvutxt \"$source_path\"\n";
        $text = _run_capture('djvutxt', $source_path);
    }
    elsif ($fulltext and $source_path =~ /epub$/i) {
        print "\nebook-convert \"$source_path\"\n";
        # Private temp file (removed when $tmp goes out of scope) instead of
        # a fixed, predictable name in /tmp. The .txt suffix is kept because
        # the old fixed output name ended in .txt as well.
        my $tmp = File::Temp->new(SUFFIX => '.txt');
        _run_capture('ebook-convert', $source_path, $tmp->filename);
        # A failed conversion skips this file rather than killing the daemon.
        $text = _slurp_file($tmp->filename);
    }
    else {
        # Raw read; hope it is text, html, doc, or that some byte runs
        # happen to be readable. Only files under 75MB are read.
        my $max_size = 75 * 1024 * 1024; # bytes, 75MB
        my $size     = (stat $source_path)[7];
        return unless defined $size;
        if ($size < $max_size) {
            print "\nrawread \"$source_path\"\n";
            $text = _slurp_file($source_path);
        }
        else {
            $text = 'too big';
        }
    }

    # A conversion that could not even run is not cached, so it gets retried.
    return unless defined $text;

    print "TEXT LIBRARY FILE CREATION: $cache_path\n\n" if $debugon;
    # The mirror directory may not exist yet; create it before writing.
    my $cache_dir = dirname($cache_path);
    unless (-d $cache_dir) {
        eval { make_path($cache_dir); 1 }
            or warn "Can't create directory $cache_dir: $@";
    }
    if (open(my $cache_fh, '>', $cache_path)) {
        print {$cache_fh} $text;
        close $cache_fh or warn "Can't finish writing $cache_path: $!\n";
    }
    else {
        warn "Crap, can't create $cache_path\n"; # don't stop on errors
    }

    return $text;
}

# Read a whole file into one string; returns undef if it cannot be opened.
sub _slurp_file {
    my ($file) = @_;
    open(my $fh, '<', $file) or return;
    my $content = do { local $/; <$fh> }; # $/ is only undefined inside this block
    close $fh;
    return $content;
}

# Run an external command without a shell (list-form pipe open, so file
# names are never re-parsed) and return everything it wrote to stdout.
# Returns undef if the command could not be started; the exit status is
# ignored, as it was with backticks.
sub _run_capture {
    my @command = @_;
    my $pipe;
    unless (open($pipe, '-|', @command)) {
        warn "Can't run $command[0]: $!\n";
        return;
    }
    my $output = do { local $/; <$pipe> };
    close $pipe;
    return $output;
}

# Escape the characters that are special in HTML text and attribute values.
sub _html_escape {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Percent-encode the few characters that break a URL path when they occur
# in a file name. "/" is deliberately left alone (encoding everything is
# what broke the links when uri_escape was tried).
sub _url_escape_path {
    my ($url) = @_;
    $url =~ s/([%?!\$&"#])/sprintf('%%%02X', ord $1)/ge;
    return $url;
}

#query: RETGEM, path: Physics/Particle Detection/, fulltext: 1, html: find-RETGEM.search
#DEBUG: cp /home/superkuh/www/hello/search-done.html /home/superkuh/www/hello/RETGEM-PhysicsParticle Detection.html
#cp: target `Detection.html' is not a directory

# niceSize($bytes)
#
# Format a byte count for display with one decimal place, e.g. "512 bytes",
# "1 kB", "1.5 kB". An undefined or empty size is shown as "0 bytes".
sub niceSize {
    my $fs = $_[0]; # First variable is the size in bytes
    $fs = 0 unless defined $fs and length $fs;
    my $dp    = 1;  # Number of decimal places required
    my @units = ('bytes','kB','MB','GB','TB','PB','EB','ZB','YB');
    my $u     = 0;
    $dp = ($dp > 0) ? 10**$dp : 1;
    # ">=" so that exactly 1024 becomes "1 kB"; stop at the last unit so the
    # index can never run off the end of @units.
    while ($fs >= 1024 and $u < $#units) {
        $fs /= 1024;
        $u++;
    }
    my $rounded = int($fs * $dp) / $dp;
    return $rounded . " " . $units[$u];
}