#!/bin/sh
# Deny IPs access to website based on undesired activity in the access logs.
# Looks at the last 2 access log files, so IP blocks will expire with log rotation.
#
# Remember to update the $LOG1 scans when you change any $LOG-scan code!
# (You don't want problems you fixed coming back when the logs rotate.)
#
# The current log is scanned in its entirety each time, so changing rules here can
# also remove IPs from the blocklist. (The previous logs are re-scanned only once
# after they are created, so old blocks will not be removed.)
#
# The nginx web server software expands the 4xx error space to signal issues
# with the client's request.
# 444 No Response - Used internally to instruct the server to return no
#     information to the client and close the connection immediately.
#     (This seems appropriate for people trying to hack my webserver and
#     bots that ignore robots.txt.)
# 400 Bad Request
# 403 Forbidden
# 405 Method Not Allowed
# 418 I'm a teapot
# 429 Too Many Requests
# Unofficial return codes:
# 450 Blocked by Windows Parental Controls (Microsoft)
# 509 Bandwidth Limit Exceeded (Apache Web Server/cPanel)
#
# Still mulling what counts as excessive traffic:
# cut -d':' -f1-3 $LOG                  # MAXREQS=60? in 1 minute
#   What about a large photo-gallery page?
# cut -d':' -f1-3 $LOG | sed 's/.$//'   # MAXREQS=75? in 10 minutes
#   What about 2 large photo-gallery pages?
# cut -d':' -f1-2 $LOG                  # MAXREQS=200? in 1 hour
# The pennsic/2016 directory has 390 photos. That's a valid human request. In 1 minute.
# The icons collection has 4000+ 100x100 images; another valid (non-hacker) human request.
# REQs is probably useless; the threshold needs to be so
# high for legitimate activity that it won't notice hacking.
# (If you don't have photo galleries, might MAXREQS=400 be good for you?
# (By the time a hacker has sent 400 requests it may be too late for your site.))
# ZEROs will probably catch anything that HACKs will, and ZEROs will catch it sooner.
# ZEROs looks for CONNECT|DELETE|PATCH|POST|PUT; HACKs looks for not GET|HEAD,
#   so that might catch more. Ignoring methods OPTIONS and TRACE.
#
# Use new ipnsl script to add hostname to IP-change lists.        2020/Nov/18
#   (More compact output than a 2nd list of found host names.)
# "cialis", "levitra", and "viagra" showed up in bogus referrer URLs of an
#   IP masquerading as GoogleBot. Those may be good reasons to block an IP.
#
# Allow looking for /.well-known/security.txt; add $ZEROALOW.     2020/Dec/08
#   (I don't have one (yet?), but it's not a reason to block an IP.)
#   See https://securitytxt.org/ .
#
# "/shell?" in $ZEROLIST needs the "?" quoted "\?".               2020/Dec/25
#
# Block requests from "Chrome/79.0.3945.117 Safari/537.36"        2020/Dec/28
#   and "Chrome/86.0.4240.111 Safari/537.36" browsers -- 80 high-traffic
#   IPs (static.#.#.#.#.clients.your-server.de) not identifying a search bot.
# Block browser "Chrome/85.0.4183.83 Safari/537.36"
# Block browser ' like Gecko 4tds562"'                            2020/Dec/30
# Block "Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134"        2021/Jan/03
# Block browser "Chrome/70.0.3538.102 Safari/537.36"
# Block browser MegaIndex.ru;                                     2021/Jan/09
#   MegaIndex.ru/2.0 ignored robots.txt; also will ignore Crawl-delay > 5s.
# Block user-agent "Go-http-client/1.1"                           2021/Jan/10
# Looks like Dreamwidth uses "Go-http-client/1.1"; removing it.   2021/Jan/10
# Block user-agent CheckMarkNetwork.                              2021/Jan/11
# Block "/jenkins/" and "/manager/html"                           2021/Jan/26
# Block '"GET http://' and '"GET https://'                        2021/Jan/29
# Block log entries advertising other sites: ' 9501 "http'       2021/Feb/01
#   (also) ' 19975 "http'
# Block https://gdnplus.com crawler; never checks robots.txt.     2021/Feb/02
# Block IndeedBot 1.1; ignores robots.txt delay.                  2021/Feb/05
# Block " - adva "; ignores robots.txt delay.                     2021/Feb/07
# Block "Java/1.6.0_04"; never checks robots.txt.                 2021/Mar/18
# Block "LanaiBotmarch" -- "known bot"                            2021/Mar/19
#   https://botsvsbrowsers.org/details/5944237/index.html
# Block "?action=" (\quote ?)                                     2021/Mar/22
# Block "LanaiBotapr1"?                                           2021/Apr/05
# Block '"LanaiBot' to cover LanaiBotmarch and LanaiBotapr1.      2021/Apr/05
# Block "Chrome/84.0.4147.135 Safari/537.36"; ignores delay.      2021/Apr/11
# Block "/dnscfg.cgi?dns". (\quote ?)                             2021/Apr/13
# Block "GoodBot". (= AdBot & LanaiBot)                           2021/Apr/18
#   https://udger.com/resources/ua-list/bot-detail?bot=GoodBot
# Whitelist "facebookexternalhit/". ($ZEROALOW)                   2021/May/02
# Block "Chrome/81.0.4044.113"; ignores robots.txt delay.         2021/May/06
# Block "Chrome/74.0.3729.169 Safari/537.36"; ignores delay.      2021/May/24
# Block "Seekport Crawler;" ignores robots.txt delay.             2021/May/26
# Block browser MegaIndex.ru again; script was somehow changed.   2021/Jul/06
# Block spider "TinyBotTestUA". Did not back off when I raised    2021/Jul/16
#   Crawl-delay from 5 to 60.
## Block browser "http://megaindex.com/crawler"    ## dropped
# Block "Chrome/91.0.4472.124 Safari/537.36". Hammered me today.  2021/Jul/26
# Define $HOME. :-(                                               2021/Jul/26
#   When root runs cron-min-blockips, it can't find my ipnsl script.
# Block nu.marginalia.wmsa.edge-crawler -- it's way too busy.     2021/Jul/29
# Tighten block of ' 19975 "http' to                              2021/Aug/16
#   '"GET / HTTP/1.1" 200 19975 "http'; the shorter string blocked a valid request.
# Block "badbot"; ignores crawl-delay.                            2021/Aug/21
# Block "Keybot Translation-Search-Machine"; ignores crawl-delay. 2021/Sep/20
# Block "DataForSeoBot"; hasn't checked robots.txt for 5 hrs.     2021/Sep/30
#   Leaving it; it's respecting crawl-delay.
# Block Chrome/70.0.3538.102, Chrome/64.0.3282.140, and           2022/Feb/06
#   Gecko/20100101 Firefox/63.0 -- used for scraping adhocsiners.org.
#   "Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134" and
#   "Chrome/70.0.3538.102 Safari/537.36" were already blocked.
#   Removed Edge from the first, and removed the new duplicates.
# Block Gecko/20100101 Firefox/45"; ignored "Disallow /lj.html"   2022/Feb/27
#   and looked for PHP URLs in the previous week.
#   (complete "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45")
# Block "python-urllib3/1.26.9"; crawler's purpose is unknown.    2022/May/03

HOME=/home/bchivers   # needed to find the ipnsl script
MAXREQS=10000   # any kind of traffic from 1 IP in 1 hour
MAXHACKS=10     # bad requests (not GET, HEAD) from 1 IP
#               # HACKS may be redundant with ZERO looking for PATCH|POST|PUT.
MAX404S=40      # 404 returns in 10 minutes (could be caused by me blogging a bad URL)
#MAXZEROS=0     # zero-tolerance strings in the log - threshold is 1; don't need to count how many.
#               # including (self-identified?) bots that ignore robots.txt
#               # Tailor this list for your website. I don't have any PHP or CGI, so I
#               # treat any request as a hacking attempt. I don't use WordPress, so /wp-
#               # is someone looking to break in. I don't allow ANY input, so a PATCH,
#               # POST or PUT is a bad guy. But if YOUR site uses these, don't include
#               # them as block triggers.
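#
# How the time-window bucketing behind those thresholds works on a
# combined-format log line (illustrative entry; 203.0.113.7 is a
# documentation IP, not real traffic):
#   203.0.113.7 - - [02/May/2022:14:37:51 -0400] "GET / HTTP/1.1" 200 1234 "-" "UA"
# cut -d':' -f1-2                 keeps '...[02/May/2022:14'    -> 1-hour buckets
# cut -d':' -f1-3                 keeps '...[02/May/2022:14:37' -> 1-minute buckets
# cut -d':' -f1-3 | sed 's/.$//'  drops the last minute digit   -> 10-minute buckets
# A later "sort | uniq -c" then counts requests per IP per window.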
#ZEROLIST='|/\.|'   # '../photos/yyyy/mm/dd/yyyy.mm.dd-hh.mm.ss.html' is valid
#|CheckMarkNetwork
ZEROLIST='"GET http://|"GET https://|\"CONNECT |\"DELETE |\"PATCH |\"POST |\"PUT |/\.[a-z]|/admin/|/boaform/|/cgi-bin/|/ckeditor/|/cms/|/data/|/db/|/dbadmin/|/dnscfg.cgi\?dns|/fckeditor/|/images/|/include/|/jenkins/|/joomla/|/manager/html|/mysql/|\.php|/php-myadmin/|/phpmy-admin/|/phpmyadmin/|/phppma/|/phpunit/|/plus/|/pma|/portal/|/shell\?|/scripts/|/sql/|/sqlmanager/|/vendor/|/weblogin|/websql/|wget+|/wordpress/|/wp-|/wp/| 400 0 | 400 166 | 200 9501 "http|"GET / HTTP/1.1" 200 19975 "http|Adsbot/3.1| - adva |"Java/1.6.0_04"|gdnplus.com|IndeedBot|http://megaindex.com/crawler|MegaIndex.ru/|MegaIndex.r24/|Chrome/64.0.3282.140 Safari/537.36|Chrome/70.0.3538.102 Safari/537.36|Chrome/74.0.3729.169 Safari/537.36|Chrome/79.0.3945.117 Safari/537.36|Chrome/81.0.4044.113|Chrome/85.0.4183.83 Safari/537.36|Chrome/84.0.4147.135 Safari/537.36|Chrome/86.0.4240.111 Safari/537.36|Chrome/91.0.4472.124 Safari/537.36|Gecko/20100101 Firefox/63.0|Gecko/20100101 Firefox/45| like Gecko 4tds562"|"LanaiBot|"GoodBot"|"badbot"|Seekport Crawler;|"TinyBotTestUA"|"Keybot Translation-Search-Machine"|"python-urllib3/1.26.9"|nu.marginalia.wmsa.edge-crawler|\?action='
ZEROALOW='GET /\.well-known/security.txt |facebookexternalhit/'
WHITELIST='10.0.0.109'   # don't block myself when I break things
# You could also do $ZEROFILE and $WHITEFILE for easier maintenance
#   (see the sketch below, after the deny-reqs file names).
# (And to share your script without revealing what you're looking for.)
# (If hackers avoid my $ZERO strings, there's not much left they can attack.)
#
LOG=/var/log/nginx/access.log       # current nginx access log
LOG1=/var/log/nginx/access.log.1    # newest rotated access log
#LOG2=/var/log/nginx/access.log.2.gz   # older rotated logs are compressed, FYI
LOG1DATE=`/bin/ls --full-time $LOG1 | /usr/bin/cut -d' ' -f6`   # file's last-changed date
#   ($LOG1DATE must be set before the deny-file names below expand it.)
#
# IPs making too many requests to my site
BADREQS=/var/log/nginx/deny-reqs    # high-traffic IPs found in the current log
NOWREQS=/var/tmp/nginx-req-IPs      # high-traffic IPs found in last check of current log
NEWREQS=/var/tmp/nginx-reqs-new     # high-traffic IPs added from last check of current log
OK_REQS=/var/tmp/nginx-reqs-gone    # high-traffic IPs removed from last check of current log
BADREQS1=/var/log/nginx/deny-reqs.$LOG1DATE   # high-traffic IPs found in the newest rotated log
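#
# A minimal sketch of the $ZEROFILE/$WHITEFILE idea mentioned above.
# Untested; the file paths are hypothetical. One pattern (or IP) per
# line, joined with '|' so egrep can use them like the inline lists:
#ZEROFILE=/etc/nginx/zero-patterns    # hypothetical pattern file
#WHITEFILE=/etc/nginx/white-ips       # hypothetical whitelist file
#ZEROLIST=`/usr/bin/paste -s -d'|' $ZEROFILE`
#WHITELIST=`/usr/bin/paste -s -d'|' $WHITEFILE`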
#
# IPs trying to hack my site (method is not GET nor HEAD)
# This may be redundant with the ZERO section looking for PATCH|POST|PUT.
BADHACKS=/var/log/nginx/deny-hacks   # hacking IPs found in the current log
NOWHACKS=/var/tmp/nginx-hack-IPs     # hacking IPs found in last check of current log
NEWHACKS=/var/tmp/nginx-hacks-new    # hacking IPs added from last check of current log
OK_HACKS=/var/tmp/nginx-hacks-gone   # hacking IPs removed from last check of current log
BADHACKS1=/var/log/nginx/deny-hacks.$LOG1DATE   # hacking IPs found in the newest rotated log
#
# IPs requesting non-existent pages
BAD404S=/var/log/nginx/deny-404s     # 404 IPs found in the current log
NOW404S=/var/tmp/nginx-404-IPs       # 404 IPs found in last check of current log
NEW404S=/var/tmp/nginx-404s-new      # 404 IPs added from last check of current log
OK_404S=/var/tmp/nginx-404s-gone     # 404 IPs removed from last check of current log
BAD404S1=/var/log/nginx/deny-404s.$LOG1DATE     # 404 IPs found in the newest rotated log
#
# zero-tolerance strings in log, incl Robots/Spiders ignoring my robots.txt
BADZEROS=/var/log/nginx/deny-zeros   # zero-tolerance IPs found in the current log
NOWZEROS=/var/tmp/nginx-zero-IPs     # zero-tolerance IPs found in last check of current log
NEWZEROS=/var/tmp/nginx-zeros-new    # zero-tolerance IPs added from last check of current log
OK_ZEROS=/var/tmp/nginx-zeros-gone   # zero-tolerance IPs removed from last check of current log
BADZEROS1=/var/log/nginx/deny-zeros.$LOG1DATE   # zero-tolerance IPs found in the newest rotated log

DENYLIST=/var/tmp/nginx-deny          # deny-IP list for nginx (bad requests and bad bots)
DENYCONF=/etc/nginx/blocked-ip.conf   # IP block list in nginx's config directory
#
# Create new Deny files from the newly-rotated access.log.1 file
if [ ! -e $BADREQS.$LOG1DATE -o $LOG1 -nt $BADREQS.$LOG1DATE ] ; then
  echo "Updating $BADREQS.$LOG1DATE."
  /usr/bin/cut -d':' -f1-2 $LOG1 | /usr/bin/sort | /usr/bin/uniq -c |
    /usr/bin/awk '$1 > '$MAXREQS' {print $2}' |
    /usr/bin/sort -t. -n -k1,1 -k2,2 -k3,3 -k4,4 | /usr/bin/uniq > $BADREQS.$LOG1DATE
  /bin/ls -l $BADREQS.$LOG1DATE
fi
if [ ! -e $BADHACKS.$LOG1DATE -o $LOG1 -nt $BADHACKS.$LOG1DATE ] ; then
  echo "Updating $BADHACKS.$LOG1DATE."
  /usr/bin/awk '$6 != "\"GET" && $6 != "\"HEAD" {print $1}' $LOG1 |
    /usr/bin/sort -t. -n -k1,1 -k2,2 -k3,3 -k4,4 | /usr/bin/uniq -c |
    /usr/bin/awk '$1 > '$MAXHACKS' {print $2}' > $BADHACKS.$LOG1DATE
  /bin/ls -l $BADHACKS.$LOG1DATE
fi
if [ ! -e $BAD404S.$LOG1DATE -o $LOG1 -nt $BAD404S.$LOG1DATE ] ; then
  echo "Updating $BAD404S.$LOG1DATE."
  /usr/bin/awk '$9 == 404' $LOG1 | /usr/bin/cut -d':' -f1-3 | sed 's/.$//' |
    /usr/bin/sort | /usr/bin/uniq -c |
    /usr/bin/awk '$1 > '$MAX404S' {print $2}' |
    /usr/bin/sort -t. -n -k1,1 -k2,2 -k3,3 -k4,4 | /usr/bin/uniq |
    /bin/egrep -v "$WHITELIST" > $BAD404S.$LOG1DATE
  /bin/ls -l $BAD404S.$LOG1DATE
fi
if [ ! -e $BADZEROS.$LOG1DATE -o $LOG1 -nt $BADZEROS.$LOG1DATE ] ; then
  echo "Updating $BADZEROS.$LOG1DATE."
  /bin/egrep "$ZEROLIST" $LOG1 | /bin/egrep -v "$ZEROALOW" |
    /usr/bin/cut -d' ' -f1 |
    /usr/bin/sort -t. -n -k1,1 -k2,2 -k3,3 -k4,4 | /usr/bin/uniq > $BADZEROS.$LOG1DATE
  /bin/ls -l $BADZEROS.$LOG1DATE
  echo
  # Show feedback page use.
  /bin/grep -h /cgi-bin/mailer.bin $LOG1 | /bin/sed 's/" "/"/g' |
    /usr/bin/tr '?&"' '\012' | /usr/bin/tr '+=' ' \011' |
    /home/bchivers/webserver/sed.hexchars | /usr/bin/uniq
fi
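#
# Untested sketch: the same one-shot scan could cover the older compressed
# log noted above ($LOG2) via zcat. $LOG2DATE and the file name are
# assumptions, and this would make zero-tolerance blocks outlive one more
# rotation (the deny-list assembly below would also need $BADZEROS.$LOG2DATE):
#LOG2=/var/log/nginx/access.log.2.gz
#LOG2DATE=`/bin/ls --full-time $LOG2 | /usr/bin/cut -d' ' -f6`
#if [ -e $LOG2 -a ! -e $BADZEROS.$LOG2DATE ] ; then
#  echo "Updating $BADZEROS.$LOG2DATE."
#  /bin/zcat $LOG2 | /bin/egrep "$ZEROLIST" | /bin/egrep -v "$ZEROALOW" |
#    /usr/bin/cut -d' ' -f1 |
#    /usr/bin/sort -t. -n -k1,1 -k2,2 -k3,3 -k4,4 | /usr/bin/uniq > $BADZEROS.$LOG2DATE
#fi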
# Check for access-log activity ($NOWREQS was the first file modified last time.)
if [ -e $NOWREQS ] ; then           # script has run before
  if [ $NOWREQS -nt $LOG ] ; then   # no new log activity
    echo -n " `/bin/ls --full-time $NOWREQS | /usr/bin/cut -d' ' -f7 | /usr/bin/cut -c1-8`-`/bin/date +%T` no traffic\r"
    if [ $NOWREQS -nt $0 ] ; then   # this script has not been updated
      exit                          # no updates needed
    fi
    echo " `/bin/date +%T` `/usr/bin/basename $0` script has been updated."
  fi
fi
echo -n " `/bin/date +%T` reviewing log \r"

# IDENTIFY IPs OVERUSING MY WEBSERVER
#   count occurrences of IPs in 1-hour windows
#   print IPs that occur more than $MAXREQS times
# Search current log for overuse
/usr/bin/cut -d':' -f1-2 $LOG | /usr/bin/sort | /usr/bin/uniq -c |
  /usr/bin/awk '$1 > '$MAXREQS' {print $2}' |
  /usr/bin/sort -t. -n -k1,1 -k2,2 -k3,3 -k4,4 | /usr/bin/uniq > $NOWREQS
#
# Check for new overusing IPs
/usr/bin/diff $BADREQS $NOWREQS | /bin/grep '> ' | /usr/bin/cut -d' ' -f2 > $NEWREQS
/usr/bin/diff $BADREQS $NOWREQS | /bin/grep '< ' | /usr/bin/cut -d' ' -f2 > $OK_REQS
if [ -s $NEWREQS -o -s $OK_REQS ] ; then
  /bin/cat $NOWREQS > $BADREQS    # update list for current access log
fi

# IDENTIFY IPs ATTACKING MY WEBSERVER
#   print IP of access.log entries that are not GET and not HEAD
#   count occurrences
#   print IPs that occur more than $MAXHACKS times
# Search current log for attacks
/usr/bin/awk '$6 != "\"GET" && $6 != "\"HEAD" {print $1}' $LOG |
  /usr/bin/sort -t. -n -k1,1 -k2,2 -k3,3 -k4,4 | /usr/bin/uniq -c |
  /usr/bin/awk '$1 > '$MAXHACKS' {print $2}' > $NOWHACKS
#
# Check for new attacking IPs
/usr/bin/diff $BADHACKS $NOWHACKS | /bin/grep '> ' | /usr/bin/cut -d' ' -f2 > $NEWHACKS
/usr/bin/diff $BADHACKS $NOWHACKS | /bin/grep '< ' | /usr/bin/cut -d' ' -f2 > $OK_HACKS
if [ -s $NEWHACKS -o -s $OK_HACKS ] ; then
  /bin/cat $NOWHACKS > $BADHACKS   # update list for current access log
fi

# IDENTIFY IPs PROBING MY WEBSERVER
#   print IP of access.log entries that returned 404
#   count occurrences of IPs in 10-min windows
#   print IPs that occur more than $MAX404S times
# Search current log for probes
#/usr/bin/awk '$9 == 404 {print $1}' $LOG | /usr/bin/sort -t. -n -k1,1 -k2,2 -k3,3 -k4,4 | /usr/bin/uniq -c | /usr/bin/awk '$1 > '$MAX404S' {print $2}' | /bin/egrep -v "$WHITELIST" > $NOW404S
/usr/bin/awk '$9 == 404' $LOG | /usr/bin/cut -d':' -f1-3 | sed 's/.$//' |
  /usr/bin/sort | /usr/bin/uniq -c |
  /usr/bin/awk '$1 > '$MAX404S' {print $2}' |
  /usr/bin/sort -t. -n -k1,1 -k2,2 -k3,3 -k4,4 | /usr/bin/uniq |
  /bin/egrep -v "$WHITELIST" > $NOW404S
#
# Check for new probing IPs
/usr/bin/diff $BAD404S $NOW404S | /bin/grep '> ' | /usr/bin/cut -d' ' -f2 > $NEW404S
/usr/bin/diff $BAD404S $NOW404S | /bin/grep '< ' | /usr/bin/cut -d' ' -f2 > $OK_404S
if [ -s $NEW404S -o -s $OK_404S ] ; then
  /bin/cat $NOW404S > $BAD404S    # update list for current access log
fi

# IDENTIFY IPs of ZERO-TOLERANCE STRINGS (incl bad robots: "Adsbot/3.1")
#   egrep for forbidden strings in access.log; keep the IP
/bin/egrep "$ZEROLIST" $LOG | /bin/egrep -v "$ZEROALOW" |
  /usr/bin/cut -d' ' -f1 |
  /usr/bin/sort -t. -n -k1,1 -k2,2 -k3,3 -k4,4 | /usr/bin/uniq > $NOWZEROS   # list unique IPs
# An awk version of uniq should be more efficient than sort|uniq for many
# events (thousands?) with few IPs.
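# Untested sketch of that awk-based uniq for the pipeline above: one pass,
# no pre-sort, though IPs come out in first-appearance order, not sorted --
# and the diff against $BADZEROS below assumes a stable sorted order, so a
# final numeric sort would still be needed before adopting this:
#/bin/egrep "$ZEROLIST" $LOG | /bin/egrep -v "$ZEROALOW" |
#  /usr/bin/cut -d' ' -f1 |
#  /usr/bin/awk '!seen[$0]++' > $NOWZEROS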
#
# Search current log for zero-tolerance strings
# /bin/grep "Adsbot/3.1" $LOG | /usr/bin/cut -d' ' -f1 | /usr/bin/awk '{ ++count[$0] ; if (count[$0] == 1) {print}}' > $NOWZEROS
#
# Check for new zero-tolerance IPs
/usr/bin/diff $BADZEROS $NOWZEROS | /bin/grep '> ' | /usr/bin/cut -d' ' -f2 > $NEWZEROS
/usr/bin/diff $BADZEROS $NOWZEROS | /bin/grep '< ' | /usr/bin/cut -d' ' -f2 > $OK_ZEROS
if [ -s $NEWZEROS -o -s $OK_ZEROS ] ; then
  /bin/cat $NOWZEROS > $BADZEROS   # update list for current access log
fi
#
# Look up IP changes
if [ -s $NEWREQS -o -s $NEWHACKS -o -s $NEW404S -o -s $NEWZEROS -o \
     -s $OK_REQS -o -s $OK_HACKS -o -s $OK_404S -o -s $OK_ZEROS ] ; then
  /bin/date
fi
if [ -s $NEWREQS ] ; then
  echo " New IPs overusing my webserver:"
# /bin/cat -n $NEWREQS
# /bin/cat $NEWREQS | /usr/bin/nslookup | /bin/grep 'name =' | /bin/cat -n
  /bin/cat $NEWREQS | $HOME/commands/ipnsl | /bin/cat -n
fi
if [ -s $OK_REQS ] ; then
  echo " IPs cleared of overusing my webserver:"
  /bin/cat $OK_REQS | $HOME/commands/ipnsl | /bin/cat -n
fi
if [ -s $NEWHACKS ] ; then
  echo " New IPs attacking my webserver:"
  /bin/cat $NEWHACKS | $HOME/commands/ipnsl | /bin/cat -n
fi
if [ -s $OK_HACKS ] ; then
  echo " IPs cleared of attacking my webserver:"
  /bin/cat $OK_HACKS | $HOME/commands/ipnsl | /bin/cat -n
fi
if [ -s $NEW404S ] ; then
  echo " New IPs probing (404s) my webserver:"
  /bin/cat $NEW404S | $HOME/commands/ipnsl | /bin/cat -n
fi
if [ -s $OK_404S ] ; then
  echo " IPs cleared of probing (404s) my webserver:"
  /bin/cat $OK_404S | $HOME/commands/ipnsl | /bin/cat -n
fi
if [ -s $NEWZEROS ] ; then
  echo " New IPs for zero-tolerance activity:"
  /bin/cat $NEWZEROS | $HOME/commands/ipnsl | /bin/cat -n
fi
if [ -s $OK_ZEROS ] ; then
  echo " IPs cleared of zero-tolerance activity:"
  /bin/cat $OK_ZEROS | $HOME/commands/ipnsl | /bin/cat -n
fi
#
# Create new deny list and signal nginx web server
if [ -s $NEWREQS -o -s $NEWHACKS -o -s $NEW404S -o -s $NEWZEROS -o \
     -s $OK_REQS -o -s $OK_HACKS -o -s $OK_404S -o -s $OK_ZEROS ] ; then
  /usr/bin/sort -t. -n -k1,1 -k2,2 -k3,3 -k4,4 \
    $BADREQS $BADHACKS $BAD404S $BADZEROS \
    $BADREQS.$LOG1DATE $BADHACKS.$LOG1DATE $BAD404S.$LOG1DATE $BADZEROS.$LOG1DATE |
    /usr/bin/uniq > $DENYLIST
  echo 'geo $bad_ip {' > $DENYCONF
  echo ' default 0;' >> $DENYCONF
  # TAB bad_ip 1;   # add these lines from $DENYLIST
  /bin/sed -e 's/^/ /' -e 's/$/ 1;/' $DENYLIST >> $DENYCONF
  echo '}' >> $DENYCONF
# echo " Restarting nginx with updated IP-Deny list ($DENYCONF)."
# service nginx restart   # This script should be run by the account that runs nginx.
  echo " Reloading nginx config with updated IP-Deny list ($DENYCONF)."
  /usr/sbin/nginx -s reload   # This script should be run by the account that runs nginx.
else
  echo -n " `/bin/date +%T` reviewed log \r"
fi
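#
# What the generated $DENYCONF looks like (documentation IP as an example;
# indentation is whatever the echo/sed lines above produce):
#   geo $bad_ip {
#    default 0;
#    203.0.113.7 1;
#   }
# Hedged wiring sketch (assumed nginx.conf setup, not part of this script):
# include the file at the http{} level, since geo{} must live there, then
# inside the server{} block return the 444 planned in the header comments:
#   if ($bad_ip) { return 444; }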