#!/usr/bin/perl
# full-text in-files search for my website using a mirror of the library/dir/structure/and/files.pdf
# where the files.pdf is actually a pre-computed plaintext copy of the (pdf/djvu/epub) with a false extension.
# basically a POE event loop that tails a webserver log looking for GETs ending in .search that then calls File::Find
# find() to look for the parsed-out query in every file's contents and generate a .html page of results.
# I hope you appreciate this documentation future me.
use strict;
use warnings;
use File::Find;
use POE qw(Wheel::FollowTail); # POE is the best module ever
use POSIX qw(strftime);
use IO::Handle; # for auto-flush to get the open() write results on disk as a file asap
# required binaries: pdftotext, djvutxt, ebook-convert (from calibre)
#use URI::Escape;
# Debug verbosity switch: the messages marked "if $debugon" throughout this file are only printed when this is 1.
my $debugon = 1; # spammy in console if 1, quiet if 0.
# webserver log                                     | file on disk    | this script generates these .html files
# user request                                      | static page     | >search results      >>appended indices
# GET /library/Physics/find-some search here.search -> searched.html  -> search-done.html    -> search-log.html
#                                                                         |
#                                                                         |--> $savedquerywithhyphens-bo_op-$savedsearchpath.html
### How to set up the plain text "mirror" of all the directories and files.
## Generate copy of directory structure (without files) to populate with the text cache first...
# rsync -a -f"+ */" -f"- *" /home/superkuh/library/ /home/superkuh/gob/librarytextcache/
## Optionally run textcache-generate.pl to pre-create the mirror files with just plain text inside them. This runs then stops.
# ./textcache-generate.pl
## Now this script thurstonia-textcache.pl is ready to operate and search. nginx doesn't need to know about it. This runs forever.
# ./thurstonia-textcache.pl
### nginx required config for .search ending URLs to redirect to the actual search output .html path.
## in mime.types add in "search" to text/html
#types {
# text/html html htm shtml lol search;
## Optionally set up an nginx limit rate to not get overloaded with searches
#limit_req_zone $binary_remote_addr zone=search:1m rate=30r/m;
## nginx location based match for .search that redirects to the intermediate search page /hello/searched.html which meta-refreshes to $htmloutfullpath
# location ~ .*?\.search {
# add_header x-search-library "Nice. Thanks for trying my search, dude.";
# limit_req zone=search nodelay;
# rewrite .*?\.search$ /hello/searched.html last;
# }
## searched.html for the intermediate "waiting page" and redirect to results after it is created by this perl script.
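## basically anything with a meta-refresh line like the one below will work.
## (the markup here is an illustrative example; only the refresh target and the 4 second delay matter)
# <html>
# <head>
# <meta http-equiv="refresh" content="4; url=/hello/search-done.html">
# <title>search commencing...</title>
# </head>
# <body>
# <p>Search commencing... please wait 4 seconds. This page will automatically redirect.</p>
# <p>If the coming results page is blank: refresh after a second or two more.</p>
# <p>Search results will update progressively. Keep refreshing if slow.</p>
# </body>
# </html>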
# ---- Locations on disk -------------------------------------------------
# Note which of these carry a trailing slash: the code below relies on it.
my $LIBRARYDIR   = '/home/superkuh/library/';              # the real files (trailing slash)
my $GHOSTDIR     = '/stuff/ghost-library/';                # (not used anymore) soft link "mirror" of the dirs and files
my $PUBLICDIR    = '/stuff/public-library/';               # (not used anymore) rsync filtered (dmca claims, etc) copy of $GHOSTDIR that nginx serves as /library/
my $TEXTCACHEDIR = '/home/superkuh/gob/librarytextcache';  # plaintext mirror of $LIBRARYDIR -- the odd one out: NO trailing slash

# ---- Where the generated pages go --------------------------------------
my $webroot   = '/home/superkuh/www';   # no trailing slash
my $websubdir = '/hello/';              # supplies the slash between webroot and page name, even if just '/'

# e.g. /home/superkuh/www/hello/search-done.html : the page showing the latest results
my $htmloutfullpath = $webroot . $websubdir . 'search-done.html';
# e.g. /home/superkuh/www/hello/search-log.html : the index of all old searches and results
my $logoutfullpath = $webroot . $websubdir . 'search-log.html';

# ---- Startup timestamp (informational only) ----------------------------
my $current_time = time;
my $datehere     = strftime('%Y-%m-%d %H:%M:%S', localtime $current_time);
if ($debugon) {
    print "The current date and time is: $datehere\n";
}

# ---- Event loop ----------------------------------------------------------
# A single POE session follows the nginx access log and passes every new
# line to parseline(). Log rollovers are noticed but need no action.
POE::Session->create(
    inline_states => {
        _start => sub {
            my $heap = $_[HEAP];
            $heap->{tailor} = POE::Wheel::FollowTail->new(
                Filename   => "/var/log/nginx/access.log",
                InputEvent => "got_log_line",
                ResetEvent => "got_log_rollover",
            );
        },
        got_log_line => sub {
            my $log_line = $_[ARG0];
            parseline($log_line);
        },
        got_log_rollover => sub {
            # nothing to do when the log is rotated
        },
    },
);

POE::Kernel->run();
exit;
# Read a whole file as raw bytes.
# Returns the content ('' for an empty file) or undef when the file cannot be opened.
sub _slurp_file {
    my ($file) = @_;
    open(my $fh, '<', $file) or return;
    local $/;    # slurp mode, restored automatically when this sub returns
    my $content = <$fh>;
    close $fh;
    return defined $content ? $content : '';
}

# Run an external program WITHOUT a shell (so quotes, spaces, $ and backticks in
# file names are harmless) and return everything it wrote to stdout.
# Returns undef when the program could not be started at all.
sub _capture_command {
    my @command = @_;
    my $pipe;
    unless (open($pipe, '-|', @command)) {
        warn "Can't run $command[0]: $!\n";
        return;
    }
    local $/;
    my $output = <$pipe>;
    close $pipe or warn "$command[0] did not exit cleanly (status $?)\n";
    return defined $output ? $output : '';
}

# Escape the characters that are special in HTML text and attribute values.
sub _escape_html {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Percent-encode every byte of a URL path except unreserved characters and "/".
# (Encoding "/" as well is what breaks the links, so it is deliberately kept.)
sub _encode_url_path {
    my ($url_path) = @_;
    $url_path =~ s{([^A-Za-z0-9\-._~/])}{sprintf '%%%02X', ord $1}ge;
    return $url_path;
}

# Write $text to $cachefile in the text cache, creating missing directories first.
# Failures are only warned about: the cache is an optimisation, not a requirement.
sub _store_in_cache {
    my ($cachefile, $text) = @_;
    require File::Basename;
    require File::Path;

    my $cachedir = File::Basename::dirname($cachefile);
    unless (-d $cachedir) {
        File::Path::make_path($cachedir, { error => \my $errors });
        if ($errors && @$errors) {
            warn "Crap, can't create directory $cachedir\n";
            return;
        }
    }

    print "TEXT LIBRARY FILE CREATION: $cachefile\n\n" if $debugon;
    my $out;
    unless (open($out, '>', $cachefile)) {
        warn "Crap, can't create $cachefile\n";    # don't stop on errors
        return;
    }
    print {$out} $text;
    close $out or warn "Crap, can't finish writing $cachefile: $!\n";
    return;
}

# Return the searchable text of one library file.
#   $file      - full path of the real file
#   $cachefile - full path of its plaintext twin in the text cache
#   $fulltext  - true to run the pdf/djvu/epub converters
# The cached copy is used when it exists; otherwise the text is produced on the
# fly and stored in the cache for next time.
# Returns undef when the file should be skipped (unreadable), and '' when a
# converter could not be run (nothing is cached then, so a later search retries).
sub _document_text {
    my ($file, $cachefile, $fulltext) = @_;

    if (-e $cachefile) {
        print "File $cachefile exists in text library cache.\n" if $debugon;
        return _slurp_file($cachefile);
    }

    my $text;
    if ($fulltext and $file =~ /pdf$/i) {
        print "\npdftotext \"$file\" -\n";
        $text = _capture_command('pdftotext', $file, '-');
    }
    elsif ($fulltext and $file =~ /djvu$/i) {
        print "\ndjvutxt \"$file\"\n";
        $text = _capture_command('djvutxt', $file);
    }
    elsif ($fulltext and $file =~ /epub$/i) {
        print "\nebook-convert \"$file\"\n";
        my $scratch = '/tmp/oogieboogie.txt';
        unlink $scratch;    # never pick up the previous book's text if this conversion fails
        if (system('ebook-convert', $file, $scratch) == 0) {
            $text = _slurp_file($scratch);
        }
        else {
            warn "ebook-convert failed for $file (status $?)\n";
        }
        unlink $scratch;
    }
    else {
        # Raw read: hope it is text or html, or that some bytes line up as characters.
        my $max_size = 75 * 1024 * 1024;    # bytes, 75MB
        my $size     = (stat $file)[7];
        return unless defined $size;
        if ($size < $max_size) {
            print "\nrawread \"$file\"\n";
            $text = _slurp_file($file);
            return unless defined $text;
        }
        else {
            $text = 'too big';
        }
    }

    return '' unless defined $text;
    _store_in_cache($cachefile, $text);
    return $text;
}

# parseline($line)
#
# Handle one line of the webserver access log. Lines that are not a
#   GET /library/<path>find-<words>.search
# request are ignored. For a search request the sub:
#   1. rewrites $htmloutfullpath with a results page, one entry per file under
#      $LIBRARYDIR<path> whose text (or full file name) contains the query,
#      case-insensitively;
#   2. copies that page to "<query>-bo_op-<path>.html" next to it;
#   3. appends one entry for the search to $logoutfullpath.
#
# Parameters: $line - the raw log line.
# Returns:    nothing useful.
# Dies:       when the results page or the search log cannot be opened for writing.
#
# NOTE(review): the HTML tags emitted here (page skeleton, <li> result entries,
# log entry) are a minimal reconstruction around the visible text; confirm them
# against the deployed pages before relying on the exact markup.
sub parseline {
    my $line = shift;
    return unless defined $line;

    #91.243.93.233 - - [17/Apr/2024:13:40:49 -0500] "GET /library/find-neutron.search HTTP/1.1" 200 2698 "http://superkuh.com/library/find-neutron.search" "Mozilla/5.0 ..."
    $line =~ s/\%20/ /g;

    # Words separated by single whitespace. Checking the match result (instead of
    # reading $1/$2 blindly) keeps stale captures from an earlier match out, and
    # this unambiguous pattern also accepts short queries such as "dna".
    return unless $line =~ m{GET /library/(.*?)(find-(\w+(?:\s\w+)*)\.search\+?) HTTP};
    my ($path, $htmlfilename, $query) = ($1, $2, $3);
    my $fulltext = 1;    # ".search+" used to select full-text mode; now it is always on

    # Strip anything outside the whitelist in case nginx logged something nasty.
    my $not_ok = qr{[^a-zA-Z0-9,\-.\s?"':/_]};
    $path  =~ s/$not_ok//g;
    $query =~ s/$not_ok//g;

    # Refuse to leave the library: no ".." path segments.
    return if grep { $_ eq '..' } split m{/}, $path;
    return unless length $query;

    print "\nquery: $query, path: $path, fulltext: $fulltext, html: $htmlfilename\n";

    my $searchpath  = $LIBRARYDIR . $path;
    my $shown_query = _escape_html($query);
    my $shown_path  = _escape_html($path);

    print "fullpath: $htmloutfullpath";
    open(my $html, '>', $htmloutfullpath) or die "Can't write to $htmloutfullpath\n$!";
    $html->autoflush(1);    # results should reach the disk as they are found

    print {$html} qq{<html>\n<head>\n<title>search results</title>\n</head>\n<body>\n},
        qq{<p><a href="${websubdir}search-log.html">Previous search results indices.</a></p>\n};
    print {$html} "\n<p>For the query \"$shown_query\" in the directory $shown_path, these results were found (so far, refresh for progress):</p>\n<ul>\n";

    # Called by find() once per directory entry, with $_ set to the bare name and
    # the current directory set to the entry's directory.
    my $wanted = sub {
        my $barefilename = $_;
        return if $barefilename =~ /^\./;
        return unless $barefilename =~ /\.(pdf|djvu|txt|jpg|gif|jpeg|png|gif|mov|mpg|mp4|mpeg|avi|mkv|ogg|wav|mp3|tar|7z|rar|gz|fits|bz2|pdb|html|epub|mht|cbz|doc|lit|7z|exe|ps|chm|awesome)/i;
        return if -d $barefilename;
        return unless -r _;

        # Locate the plaintext twin: same relative path under $TEXTCACHEDIR.
        my $fullname = $File::Find::name;
        return unless index($fullname, $LIBRARYDIR) == 0;
        my $relative  = substr $fullname, length $LIBRARYDIR;
        my $cachefile = "$TEXTCACHEDIR/$relative";    # $TEXTCACHEDIR has no trailing slash

        my $text = _document_text($fullname, $cachefile, $fulltext);
        return unless defined $text;

        # Match in the text, or failing that in the file path/name itself.
        return unless $text =~ /\Q$query\E/i || $fullname =~ /\Q$query\E/i;

        my $webpath  = _encode_url_path('/library/' . $relative);
        my $filesize = niceSize((-s $fullname) || 0);
        print {$html} qq{<li><a href="$webpath">} . _escape_html($barefilename) . qq{</a> ($filesize)</li>\n};
    };
    find($wanted, $searchpath) if -d $searchpath;

    # html footer for search
    print {$html} qq{</ul>\n\n<p>Only searching subdirectories is allowed. /library/ full searches are disabled. To search something else, just add "/find-your search terms.search" to any directory in the library.</p>\n},
        qq{<p>"/find-a few words.search", when added to any URL in the Library subdirectory will return search results for "a few words" after a delay. If there is anything not alphanumberic in the string the log parser won't even trigger the search. No caching or pre-conversion to text is implemented yet so it has to convert every time and it's really slow, as you can see.</p>\n},
        qq{</body>\n</html>\n\n};
    close $html or warn "$!";

    # Keep a permanent copy of this result page, named after the query and path.
    (my $saved_query = $query) =~ s/\s/-/g;
    (my $saved_path  = $path)  =~ s{[/\s]}{-}g;
    $saved_path =~ s/-$//;
    my $seperator           = 'bo_op';    # I have my reasons.
    my $saved_name          = "$saved_query-$seperator-$saved_path.html";
    my $savedsearchfullpath = "$webroot$websubdir$saved_name";

    print "DEBUG: cp $htmloutfullpath $savedsearchfullpath\n";
    require File::Copy;    # no shell involved, so spaces in the name are fine
    File::Copy::copy($htmloutfullpath, $savedsearchfullpath)
        or warn "Can't copy $htmloutfullpath to $savedsearchfullpath: $!\n";

    # Append this search to the index of previous searches. Only list items are
    # appended, so any header already at the top of that file is left alone.
    open(my $log, '>>', $logoutfullpath) or die "Can't write to $logoutfullpath\n$!";
    my $stamp      = strftime("%Y-%m-%d %H:%M:%S", localtime);    # time of this search, not of script launch
    my $saved_href = _encode_url_path($websubdir . $saved_name);
    print {$log} qq{<li>$stamp: <a href="$saved_href">$shown_query</a> in $shown_path</li>\n};
    close $log or warn "$!";
    return;
} # parseline event end
#query: RETGEM, path: Physics/Particle Detection/, fulltext: 1, html: find-RETGEM.search
#DEBUG: cp /home/superkuh/www/hello/search-done.html /home/superkuh/www/hello/RETGEM-PhysicsParticle Detection.html
#cp: target `Detection.html' is not a directory
# niceSize($bytes)
#
# Format a byte count as a short human-readable string such as "512 bytes",
# "1.5 kB" or "3.2 GB". Units step by 1024 and the value is truncated (not
# rounded) to one decimal place.
#
# Parameters: $bytes - size in bytes; undef or '' (what -s gives for a missing
#                      or empty file) is treated as 0.
# Returns:    "<number> <unit>"; sizes beyond the largest unit stay in YB.
sub niceSize {
    my $fs = $_[0];    # First variable is the size in bytes
    my $dp = 1;        # Number of decimal places required
    my @units = ('bytes','kB','MB','GB','TB','PB','EB','ZB','YB');

    $fs = 0 unless defined $fs && length $fs;

    my $scale = ($dp > 0) ? 10**$dp : 1;
    my $u     = 0;
    # ">=" so that exactly 1024 bytes becomes "1 kB"; the index check keeps $u
    # inside @units so the result always carries a unit.
    while ($fs >= 1024 && $u < $#units) {
        $fs /= 1024;
        $u++;
    }
    return (int($fs * $scale) / $scale) . " " . $units[$u];
}