Skip to content

Commit 828a12c

Browse files
committed
Add google bot IP "firewall"
[minor]
1 parent 5db058d commit 828a12c

6 files changed

Lines changed: 202 additions & 191 deletions

File tree

CLAUDE.md

Lines changed: 0 additions & 187 deletions
This file was deleted.

README.md

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,10 @@ services:
6262
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.captchaProvider: turnstile
6363
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.siteKey: ${TURNSTILE_SITE_KEY}
6464
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.secretKey: ${TURNSTILE_SECRET_KEY}
65-
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.goodBots: apple.com,archive.org,commoncrawl.org,duckduckgo.com,facebook.com,google.com,googlebot.com,googleusercontent.com,instagram.com,kagibot.org,linkedin.com,msn.com,openalex.org,twitter.com,x.com
65+
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.goodBots: apple.com,archive.org,commoncrawl.org,duckduckgo.com,facebook.com,google.com,instagram.com,kagibot.org,linkedin.com,msn.com,openalex.org,twitter.com,x.com
6666
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.persistentStateFile: /tmp/state.json
6767
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableStateReconciliation: "false"
68+
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableGooglebotIPCheck: "true"
6869
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.periodSeconds: 30
6970
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.failureThreshold: 3
7071
networks:
@@ -82,7 +83,7 @@ services:
8283
--providers.docker=true
8384
--providers.docker.network=default
8485
--experimental.plugins.captcha-protect.modulename=github.com/libops/captcha-protect
85-
--experimental.plugins.captcha-protect.version=v1.11.1
86+
--experimental.plugins.captcha-protect.version=v1.12.0
8687
volumes:
8788
- /var/run/docker.sock:/var/run/docker.sock:z
8889
- /CHANGEME/TO/A/HOST/PATH/FOR/STATE/FILE:/tmp/state.json:rw
@@ -128,6 +129,7 @@ services:
128129
| `enableStatsPage` | `string` | `"false"` | Allows `exemptIps` to access `/captcha-protect/stats` to monitor the rate limiter. |
129130
| `logLevel` | `string` | `"INFO"` | Log level for the middleware. Options: `ERROR`, `WARNING`, `INFO`, or `DEBUG`. |
130131
| `persistentStateFile` | `string` | `""` | File path to persist rate limiter state across Traefik restarts. In Docker, mount this file from the host. |
132+
| `enableGooglebotIPCheck` | `string` | `"false"` | When `"true"`, treat requests from Google's published bot IP ranges as good bots. |
131133
| `enableStateReconciliation` | `string` | `"false"` | When `"true"`, reads and merges disk state before each save to prevent multiple instances from overwriting data. Adds extra I/O overhead. Only enable for multi-instance deployments sharing state. **Performance warning**: Not recommended for sites with >1M unique visitors due to reconciliation overhead (5-8s per cycle at scale). |
132134

133135
### Circuit Breaker (failover if a captcha provider is unavailable)
@@ -152,14 +154,17 @@ The circuit breaker provides automatic failover when the primary captcha provide
152154

153155
### Good Bots
154156

155-
To avoid having this middleware impact your SEO score, it's recommended to provide a value for `goodBots`. By default, no bots will be allowed to crawl your protected routes beyond the rate limit unless their second level domain (e.g. `google.com`) is configured as a good bot.
157+
To avoid having this middleware impact your SEO score, it's recommended to provide a value for `goodBots`. By default, no bots will be allowed to crawl your protected routes beyond the rate limit unless their second level domain (e.g. `bing.com`) is configured as a good bot.
156158

157159
A good default value for `goodBots` would be:
158160

159161
```
160-
goodBots: apple.com,archive.org,duckduckgo.com,facebook.com,google.com,googlebot.com,googleusercontent.com,instagram.com,kagibot.org,linkedin.com,msn.com,openalex.org,twitter.com,x.com
162+
enableGooglebotIPCheck: "true"
163+
goodBots: apple.com,archive.org,duckduckgo.com,facebook.com,google.com,instagram.com,kagibot.org,linkedin.com,msn.com,openalex.org,twitter.com,x.com
161164
```
162165

166+
Since Google publishes its bot IPs, we can also leverage their API to allow Google's bots based on IP address. This can be enabled with `enableGooglebotIPCheck: "true"`.
167+
163168
**However** if you set the config parameter `protectParameters="true"`, even good bots won't be allowed to crawl protected routes if a URL parameter is on the request (e.g. `/foo?bar=baz`). This `protectParameters` feature is meant to help protect faceted search pages.
164169

165170

ci/docker-compose.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ services:
1919
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.ipForwardedHeader: "X-Forwarded-For"
2020
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.logLevel: "DEBUG"
2121
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.goodBots: ""
22+
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableGooglebotIPCheck: "true"
2223
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.protectRoutes: "/"
2324
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.persistentStateFile: "/tmp/state.json"
2425
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableStateReconciliation: "true"
@@ -48,6 +49,7 @@ services:
4849
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.ipForwardedHeader: "X-Forwarded-For"
4950
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.logLevel: "DEBUG"
5051
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.goodBots: ""
52+
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableGooglebotIPCheck: "true"
5153
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.protectRoutes: "/"
5254
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.persistentStateFile: "/tmp/state.json"
5355
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableStateReconciliation: "true"

internal/state/google.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package state
2+
3+
import (
4+
"log/slog"
5+
"net"
6+
"sync"
7+
)
8+
9+
// GooglebotIPs holds the list of Googlebot IP ranges and answers membership
// queries against it. The range list is guarded by mu, so Update and Contains
// may be called concurrently. A GooglebotIPs must not be copied after first
// use because it embeds a sync.RWMutex.
type GooglebotIPs struct {
	cidrs []*net.IPNet
	mu    sync.RWMutex
}

// NewGooglebotIPs creates a GooglebotIPs with no ranges loaded. Contains
// reports false for every address until Update installs a list.
func NewGooglebotIPs() *GooglebotIPs {
	return &GooglebotIPs{
		cidrs: make([]*net.IPNet, 0),
	}
}

// Update parses cidrs and replaces the stored IP ranges with every entry that
// parsed successfully. Entries that fail to parse are logged at error level
// and skipped; the previous list is discarded even if no entry is valid.
//
// If log is nil, slog.Default() is used so a malformed entry cannot cause a
// nil-pointer panic.
func (g *GooglebotIPs) Update(cidrs []string, log *slog.Logger) {
	if log == nil {
		log = slog.Default()
	}

	// Parse (and log) before taking the write lock so concurrent Contains
	// callers are only blocked for the pointer swap below, not for the whole
	// refresh.
	parsed := make([]*net.IPNet, 0, len(cidrs))

	for _, s := range cidrs {
		_, network, err := net.ParseCIDR(s)
		if err != nil {
			log.Error("error parsing CIDR", "cidr", s, "err", err)

			continue
		}

		parsed = append(parsed, network)
	}

	g.mu.Lock()
	g.cidrs = parsed
	g.mu.Unlock()
}

// Contains reports whether ip falls within any of the stored Googlebot IP
// ranges. It returns false when no ranges have been loaded.
func (g *GooglebotIPs) Contains(ip net.IP) bool {
	g.mu.RLock()
	defer g.mu.RUnlock()

	for _, network := range g.cidrs {
		if network.Contains(ip) {
			return true
		}
	}

	return false
}

0 commit comments

Comments
 (0)