|
4 | 4 | from typing import Dict, Optional, List, Union, Any |
5 | 5 | from urllib.parse import urlencode, quote_plus |
6 | 6 | from docassemble.base.util import log, get_config, interview_url |
| 7 | +import re |
7 | 8 |
|
8 | 9 | # reference: https://gist.github.com/JeffPaine/3145490 |
9 | 10 | # https://docs.github.com/en/free-pro-team@latest/rest/reference/issues#create-an-issue |
@@ -167,29 +168,82 @@ def feedback_link( |
167 | 168 | ) |
168 | 169 |
|
169 | 170 |
|
# Domains that are treated as spam whenever they appear in an issue body.
_DEFAULT_SPAM_URLS = ("leadgeneration.com", "leadmagnet.com")

# Built-in phrases (all lowercase) that are treated as spam indicators.
_DEFAULT_SPAM_KEYWORDS = (
    "100 times more effective",
    "adult dating",
    "backlink",
    "backlinks",
    "binary options",
    "bitcoin investment",
    "cheap hosting",
    "cheap meds",
    "cialis",
    "credit repair fast",
    "earn money online",
    "email me",
    "escort service",
    "forex trading",
    "free gift cards",
    "free trial",
    "get rich quick",
    "increase website traffic",
    "international long distance calling",
    "keep this info confidential",
    "lead feature",
    "lead generation",
    "lottery winner",
    "market your business",
    "nigerian prince",
    "online casino",
    "payment/deposit handler",
    "reliable business representative",
    "remote job opportunity",
    "results are astounding",
    "send an email",
    "seo services",
    "split the funds",
    "turkish bank",
    "unsubscribe",
    "viagra",
    "visit this link",
    "web lead",
    "web visitors",
    "work from home",
    "your late relative",
)

# Compiled once at import time; it is only ever applied to a lower-cased body.
_HTTP_URL_PATTERN = re.compile(r"https?://\S+")


def _normalize_keywords(raw: Any) -> List[str]:
    """Return `raw` as a new list of lowercase, non-empty keyword strings."""
    if not raw:
        return []
    if isinstance(raw, str):
        # A bare string would otherwise be iterated character by character.
        raw = [raw]
    lowered = (str(item).lower() for item in raw if item is not None)
    # Drop empty strings: "" is a substring of every body.
    return [keyword for keyword in lowered if keyword]


def is_likely_spam(
    body: Optional[str], keywords: Optional[List[str]] = None, filter_urls: bool = True
) -> bool:
    """
    Check if the body of the issue is likely spam based on a set of keywords and URLs.

    The body is compared case-insensitively against the built-in keywords and
    domains, plus any keywords passed in `keywords`, plus any keywords listed in
    the global configuration under `github issues: spam keywords`. The three
    sources are always combined. The caller's `keywords` list is never modified.

    Args:
        body (Optional[str]): the body of the issue
        keywords (Optional[List[str]]): additional keywords that indicate spam;
            matched as case-insensitive substrings of the body
        filter_urls (bool): if True (the default), any body containing an
            `http://` or `https://` URL is treated as spam

    Returns:
        bool: True if the body matches a spam keyword (or contains a URL while
        `filter_urls` is True); False otherwise, including when `body` is
        empty or None.
    """
    # Nothing to inspect: skip the config lookup and keyword assembly entirely.
    if not body:
        return False
    body = body.lower()

    github_config = get_config("github issues", {})
    configured = (
        github_config.get("spam keywords") if isinstance(github_config, dict) else None
    )

    # Build a fresh list so neither the caller's list nor the config is mutated.
    candidates = (
        list(_DEFAULT_SPAM_KEYWORDS)
        + list(_DEFAULT_SPAM_URLS)
        + _normalize_keywords(keywords)
        + _normalize_keywords(configured)
    )
    if any(keyword in body for keyword in candidates):
        return True

    if filter_urls and _HTTP_URL_PATTERN.search(body):
        return True

    return False
194 | 248 |
|
195 | 249 |
|
|
0 commit comments