From 192c11872b20922044e0dc777e9544d4c5df8d51 Mon Sep 17 00:00:00 2001 From: Gabriel Arrouye <33171826+Riderfighter@users.noreply.github.com> Date: Mon, 1 Sep 2025 13:57:08 +0200 Subject: [PATCH 1/3] Better regex for finding proxies + deduplication of proxies based on the exit IP --- src/output.rs | 18 ++++++++++++++++++ src/parsers.rs | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/output.rs b/src/output.rs index eb82a518c..063309aa8 100644 --- a/src/output.rs +++ b/src/output.rs @@ -73,6 +73,24 @@ pub async fn save_proxies( proxies.sort_unstable_by(compare_natural); } + // Deduplicate proxies by exit_ip when available. Different proxies can exit via the same IP. + // We do this after sorting so that if sorted by speed, the fastest one is kept. + { + // Track seen exit_ip per protocol to avoid cross-protocol removal + let mut seen: std::collections::HashSet<(ProxyType, String)> = std::collections::HashSet::new(); + let mut deduped = Vec::with_capacity(proxies.len()); + for p in proxies.into_iter() { + if let Some(ref ip) = p.exit_ip { + let key = (p.protocol, ip.clone()); + if !seen.insert(key) { + continue; + } + } + deduped.push(p); + } + proxies = deduped; + } + if config.output.json.enabled { let (maybe_asn_db, maybe_geo_db) = tokio::try_join!( async { diff --git a/src/parsers.rs b/src/parsers.rs index 902e76fc8..df97f224e 100644 --- a/src/parsers.rs +++ b/src/parsers.rs @@ -1,7 +1,7 @@ use std::sync::LazyLock; pub static PROXY_REGEX: LazyLock<fancy_regex::Regex> = LazyLock::new(|| { - let pattern = r"(?:^|[^0-9A-Za-z])(?:(?P<protocol>https?|socks[45]):\/\/)?(?:(?P<username>[0-9A-Za-z]{1,64}):(?P<password>[0-9A-Za-z]{1,64})@)?(?P<host>[A-Za-z][\-\.A-Za-z]{0,251}[A-Za-z]|[A-Za-z]|(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3}):(?P<port>[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])(?=[^0-9A-Za-z]|$)"; + let pattern = 
r"(?:^|[^0-9A-Za-z])(?:(?Phttps?|socks[45]):\/\/)?(?:(?P[0-9A-Za-z._~-]{1,64}):(?P[0-9A-Za-z._~-]{1,64})@)?(?P[A-Za-z][\-\.A-Za-z]{0,251}[A-Za-z]|[A-Za-z]|(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3}):(?P[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])(?=[^0-9A-Za-z]|$)"; fancy_regex::RegexBuilder::new(pattern) .backtrack_limit(usize::MAX) .build() From 0c51e22aed2394caff66f76a1cd404367d5774d2 Mon Sep 17 00:00:00 2001 From: Gabriel Arrouye <33171826+Riderfighter@users.noreply.github.com> Date: Tue, 2 Sep 2025 17:04:49 +0200 Subject: [PATCH 2/3] Better regex for finding proxies + deduplication of proxies based on the exit IP --- src/output.rs | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/output.rs b/src/output.rs index 063309aa8..e72010395 100644 --- a/src/output.rs +++ b/src/output.rs @@ -10,11 +10,11 @@ use color_eyre::eyre::WrapErr as _; use itertools::Itertools as _; use crate::{ - HashMap, config::Config, ipdb, proxy::{Proxy, ProxyType}, utils::is_docker, + HashMap, }; fn compare_timeout(a: &Proxy, b: &Proxy) -> Ordering { @@ -75,21 +75,20 @@ pub async fn save_proxies( // Deduplicate proxies by exit_ip when available. Different proxies can exit via the same IP. // We do this after sorting so that if sorted by speed, the fastest one is kept. 
- { - // Track seen exit_ip per protocol to avoid cross-protocol removal - let mut seen: std::collections::HashSet<(ProxyType, String)> = std::collections::HashSet::new(); - let mut deduped = Vec::with_capacity(proxies.len()); - for p in proxies.into_iter() { - if let Some(ref ip) = p.exit_ip { - let key = (p.protocol, ip.clone()); - if !seen.insert(key) { - continue; - } + // Track seen exit_ip per protocol to avoid cross-protocol removal + let mut seen: std::collections::HashSet<(ProxyType, String)> = + std::collections::HashSet::new(); + let mut deduped = Vec::with_capacity(proxies.len()); + for p in proxies { + if let Some(ip) = &p.exit_ip { + let key = (p.protocol, ip.clone()); + if !seen.insert(key) { + continue; } - deduped.push(p); } - proxies = deduped; + deduped.push(p); } + proxies = deduped; if config.output.json.enabled { let (maybe_asn_db, maybe_geo_db) = tokio::try_join!( From 234882f9d0596a32a38c9330c9e36d36cfaa6b8c Mon Sep 17 00:00:00 2001 From: Gabriel Arrouye <33171826+Riderfighter@users.noreply.github.com> Date: Tue, 2 Sep 2025 20:35:11 +0200 Subject: [PATCH 3/3] Improve regex --- src/parsers.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/parsers.rs b/src/parsers.rs index df97f224e..65a2f7866 100644 --- a/src/parsers.rs +++ b/src/parsers.rs @@ -1,13 +1,12 @@ use std::sync::LazyLock; pub static PROXY_REGEX: LazyLock<fancy_regex::Regex> = LazyLock::new(|| { - let pattern = r"(?:^|[^0-9A-Za-z])(?:(?P<protocol>https?|socks[45]):\/\/)?(?:(?P<username>[0-9A-Za-z._~-]{1,64}):(?P<password>[0-9A-Za-z._~-]{1,64})@)?(?P<host>[A-Za-z][\-\.A-Za-z]{0,251}[A-Za-z]|[A-Za-z]|(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3}):(?P<port>[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])(?=[^0-9A-Za-z]|$)"; + let pattern = 
r"(?:^|[^0-9A-Za-z])(?:(?Phttps?|socks[45]):\/\/)?(?:(?P[0-9A-Za-z._~\-]{1,256}):(?P[0-9A-Za-z._~\-]{1,256})@)?(?P[A-Za-z][\-\.A-Za-z]{0,251}[A-Za-z]|[A-Za-z]|(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3}):(?P[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])(?=[^0-9A-Za-z]|$)"; fancy_regex::RegexBuilder::new(pattern) .backtrack_limit(usize::MAX) .build() .unwrap() }); - static IPV4_REGEX: LazyLock = LazyLock::new(|| { let pattern = r"^\s*(?P(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})(?::(?:[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?\s*$"; fancy_regex::Regex::new(pattern).unwrap()