Skip to content

Commit f99f349

Browse files
Merge pull request #57 from SuffolkLITLab/update-spam-filters
Add more keywords based on recent experience with Vermont; also check for URLs
2 parents 4ac0469 + 9954759 commit f99f349

File tree

1 file changed

+73
-19
lines changed

1 file changed

+73
-19
lines changed

docassemble/GithubFeedbackForm/github_issue.py

Lines changed: 73 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
from typing import Dict, Optional, List, Union, Any
55
from urllib.parse import urlencode, quote_plus
66
from docassemble.base.util import log, get_config, interview_url
7+
import re
78

89
# reference: https://gist.github.com/JeffPaine/3145490
910
# https://docs.github.com/en/free-pro-team@latest/rest/reference/issues#create-an-issue
@@ -167,29 +168,82 @@ def feedback_link(
167168
)
168169

169170

170-
def is_likely_spam(
    body: Optional[str], keywords: Optional[List[str]] = None, filter_urls: bool = True
) -> bool:
    """
    Check if the body of the issue is likely spam based on a set of keywords and URLs.

    Three keyword sources are always combined: the hardcoded list below, any
    keywords passed in by the caller, and any keywords listed in the global
    configuration under `github issues: spam keywords`. Matching is a
    case-insensitive substring test against the body.

    Args:
        body (Optional[str]): the body of the issue
        keywords (Optional[List[str]]): extra keywords that indicate spam; they are
            added to (not substituted for) the hardcoded and configured keywords.
            The list passed in is never modified.
        filter_urls (bool): if True, any body containing an `http://` or `https://`
            URL is treated as spam, even when no keyword matches

    Returns:
        bool: True if the body matches a spam keyword (or contains a URL while
            `filter_urls` is True); False otherwise, including for an empty or
            missing body.
    """
    # Nothing to inspect: an empty or missing body is never spam.
    if not body:
        return False

    builtin_urls = ["leadgeneration.com", "leadmagnet.com"]
    builtin_keywords = [
        "100 times more effective",
        "adult dating",
        "backlink",
        "backlinks",
        "binary options",
        "bitcoin investment",
        "cheap hosting",
        "cheap meds",
        "cialis",
        "credit repair fast",
        "earn money online",
        "email me",
        "escort service",
        "forex trading",
        "free gift cards",
        "free trial",
        "get rich quick",
        "increase website traffic",
        "international long distance calling",
        "keep this info confidential",
        "lead feature",
        "lead generation",
        "lottery winner",
        "market your business",
        "nigerian prince",
        "online casino",
        "payment/deposit handler",
        "reliable business representative",
        "remote job opportunity",
        "results are astounding",
        "send an email",
        "seo services",
        "split the funds",
        "turkish bank",
        "unsubscribe",
        "viagra",
        "visit this link",
        "web lead",
        "web visitors",
        "work from home",
        "your late relative",
    ]

    # The config section or the key may be present but empty (None), so fall
    # back to an empty value at each step instead of calling .get() on None.
    configured = (get_config("github issues", {}) or {}).get("spam keywords") or []

    # Collect into a fresh list so the caller's `keywords` argument is never
    # extended in place (the previous `keywords += ...` mutated it).
    candidates: List[Any] = []
    for source in (keywords, builtin_keywords, builtin_urls, configured):
        if not source:
            continue
        if isinstance(source, str):
            # A lone string would otherwise be iterated character by character.
            source = [source]
        candidates.extend(source)

    # The body is lower-cased below, so the keywords must be too; empty
    # entries are dropped because "" is a substring of every body.
    needles = [str(candidate).lower() for candidate in candidates if candidate]

    body = body.lower()
    if any(needle in body for needle in needles):
        return True

    if filter_urls:
        url_regex = re.compile(r"(https?:\/\/[^\s]+)", flags=re.IGNORECASE)
        if url_regex.search(body):
            return True

    return False
194248

195249

0 commit comments

Comments
 (0)