Skip to content

Commit 2fefe6c

Browse files
Merge pull request #58 from SuffolkLITLab/check-spam-with-gemini
Add spam classifier with google gemini flash 2.0 experimental and check for spam earlier
2 parents f99f349 + 0b7ec6c commit 2fefe6c

File tree

4 files changed

+111
-25
lines changed

4 files changed

+111
-25
lines changed

docassemble/GithubFeedbackForm/data/questions/feedback.yml

Lines changed: 35 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -81,11 +81,11 @@ subquestion: |
8181
The information you type here will be publicly available. That means anyone
8282
will be able to see it. Use this form to tell us about problems that do not
8383
include any personal information.
84-
84+
8585
${ collapse_template(al_how_to_get_legal_help) }
8686
8787
Thank you for telling us about your experience with this website.
88-
88+
8989
fields:
9090
- no label: reason
9191
input type: radio
@@ -158,12 +158,12 @@ decoration: opinion
158158
subquestion: |
159159
This form is designed to be used by testers and community
160160
stakeholders.
161-
161+
162162
The information you type here will be publicly available. That means anyone
163163
will be able to see it.
164-
164+
165165
${ collapse_template(al_how_to_get_legal_help) }
166-
166+
167167
fields:
168168
- "**What would you like to tell us about?**": reason
169169
input type: radio
@@ -236,7 +236,7 @@ subject: |
236236
Do you need more help?
237237
content: |
238238
If you need more help, these are free resources:
239-
239+
240240
- [Find Free Legal help - Legal Services Corporation](https://www.lsc.gov/about-lsc/what-legal-aid/get-legal-help)
241241
242242
- [Hire a lawyer](https://www.americanbar.org/groups/legal_services/flh-home/flh-hire-a-lawyer/)
@@ -248,7 +248,7 @@ question: |
248248
decoration: lifebuoy
249249
subquestion: |
250250
We are sorry that we couldn't do more to help you.
251-
251+
252252
${ al_how_to_get_legal_help }
253253
254254
buttons:
@@ -261,7 +261,7 @@ question: |
261261
decoration: opinion
262262
subquestion: |
263263
We appreciate you letting us know how we are doing.
264-
264+
265265
% if issue_url:
266266
If you would like to track this issue, you can [follow
267267
it](${issue_url}) on GitHub.
@@ -304,33 +304,47 @@ content: |
304304
% endif
305305
---
306306
########################## Send to GitHub code ##########################
307+
only sets:
308+
- note_issue
307309
need:
308310
- should_send_to_github
309311
- question_id
310312
- variable
311313
- package_version
312314
- filename
313315
code: |
314-
if not task_performed('issue noted', persistent=True):
315-
saved_uuid
316-
if showifdef('would_be_on_panel', False):
316+
if task_performed('issue noted', persistent=True):
317+
pass
318+
elif is_likely_spam(issue_template.content):
319+
log("Not saving feedback because it looks like spam")
320+
mark_task_as_performed('issue noted', persistent=True)
321+
issue_url, saved_uuid = None, None
322+
note_issue = False # End block early
323+
else:
324+
saved_uuid # Trigger the code to save locally on the server and optionally link the session answers. We do this regardless of whether we send to GitHub
325+
if showifdef('would_be_on_panel'):
317326
add_panel_participant(panel_email)
327+
318328
if should_send_to_github:
319-
issue_url
320-
if issue_url:
321-
if saved_uuid:
322-
set_feedback_github_url(saved_uuid, issue_url)
329+
issue_url # Trigger the code to save as a GitHub issue
330+
if issue_url and saved_uuid:
331+
# Link the GitHub issue to the saved feedback in database
332+
set_feedback_github_url(saved_uuid, issue_url)
323333
else:
324-
al_error_email
325334
log(f"This form was not able to add an issue on the {github_user}/{github_repo} repo. Check your config.")
326-
if al_error_email and not is_likely_spam(issue_template.content):
335+
if al_error_email:
327336
log(f"Unable to create issue on repo {github_repo}, falling back to emailing {al_error_email}")
328-
send_email(to=al_error_email, subject=f"{github_repo} - {issue_template.subject_as_html(trim=True)}", template=issue_template)
337+
send_email(
338+
to=al_error_email,
339+
subject=f"{github_repo} - {issue_template.subject_as_html(trim=True)}",
340+
template=issue_template
341+
)
329342
else:
330-
log(f"~~~USER FEEDBACK~~~ {github_repo} -{issue_template.subject_as_html(trim=True)} - {issue_template.content_as_html(trim=True)}")
343+
log(f"~~~USER FEEDBACK~~~ {github_repo} - {issue_template.subject_as_html(trim=True)} - {issue_template.content_as_html(trim=True)}")
344+
else:
345+
issue_url = None
331346
mark_task_as_performed('issue noted', persistent=True)
332-
else:
333-
log("Already sent feedback to github from a feedback interview, not going to send again")
347+
334348
note_issue = True
335349
---
336350
code: |

docassemble/GithubFeedbackForm/github_issue.py

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@
66
from docassemble.base.util import log, get_config, interview_url
77
import re
88

9+
try:
10+
import google.generativeai as genai
11+
except ImportError:
12+
pass
13+
914
# reference: https://gist.github.com/JeffPaine/3145490
1015
# https://docs.github.com/en/free-pro-team@latest/rest/reference/issues#create-an-issue
1116

@@ -16,6 +21,7 @@
1621
"make_github_issue",
1722
"feedback_link",
1823
"is_likely_spam",
24+
"is_likely_spam_from_genai",
1925
"prefill_github_issue_url",
2026
]
2127
USERNAME = get_config("github issues", {}).get("username")
@@ -168,8 +174,69 @@ def feedback_link(
168174
)
169175

170176

177+
def is_likely_spam_from_genai(
178+
body: Optional[str],
179+
context: Optional[str] = None,
180+
gemini_api_key: Optional[str] = None,
181+
model="gemini-2.0-flash-exp",
182+
) -> bool:
183+
"""
184+
Check if the body of the issue is likely spam with the help of Google Gemini Flash experimental.
185+
186+
Args:
187+
body (Optional[str]): the body of the issue
188+
context (Optional[str]): the context of the issue to help rate it as spam or not, defaults to a guided interview in the legal context
189+
gemini_api_key (Optional[str]): the API key for the Google Gemini Flash API, can be specified in the global config as `google gemini api key`
190+
model (Optional[str]): the model to use for the spam detection, defaults to "gemini-2.0-flash-exp", can be specified in the global config
191+
as `github issues: spam model`
192+
"""
193+
if not body:
194+
return False
195+
196+
model = model or get_config("github issues", {}).get(
197+
"spam model", "gemini-2.0-flash-exp"
198+
)
199+
gemini_api_key = gemini_api_key or get_config("google gemini api key")
200+
201+
if not gemini_api_key: # not passed as a parameter OR in the global config
202+
log("Not using Google Gemini Flash to check for spam: no API key provided")
203+
return False
204+
205+
if context is None: # empty string is a valid input
206+
context = "a guided interview in the legal context"
207+
208+
try:
209+
genai.configure(api_key=gemini_api_key)
210+
model = genai.GenerativeModel(
211+
model_name=model,
212+
system_instruction=f"""
213+
You are reviewing a feedback form for {context}. Your job is to allow as many
214+
relevant feedback responses as possible while filtering out irrelevant and spam feedback,
215+
especially targeted advertising that isn't pointing out a problem on the guided interview.
216+
217+
Rate the user's feedback as 'spam' or 'not spam' based on the context of the guided interview.
218+
Answer only with the exact keywords: 'spam' or 'not spam'.
219+
""",
220+
)
221+
222+
response = model.generate_content(body)
223+
if response.text.strip() == "spam":
224+
return True
225+
except NameError:
226+
log(
227+
f"Error using Google Gemini Flash: the `google.generativeai` module is not available"
228+
)
229+
except Exception as e:
230+
log(f"Error using Google Gemini Flash: {e}")
231+
return False
232+
return False
233+
234+
171235
def is_likely_spam(
172-
body: Optional[str], keywords: Optional[List[str]] = None, filter_urls: bool = True
236+
body: Optional[str],
237+
keywords: Optional[List[str]] = None,
238+
filter_urls: bool = True,
239+
model: Optional[str] = None,
173240
) -> bool:
174241
"""
175242
Check if the body of the issue is likely spam based on a set of keywords and URLs.
@@ -179,9 +246,10 @@ def is_likely_spam(
179246
180247
Args:
181248
body (Optional[str]): the body of the issue
182-
keywords (Optional[List[str]]): a list of keywords that are likely spam, defaults to a set of keywords
249+
keywords (Optional[List[str]]): a list of additional keywords that are likely spam, defaults to a set of keywords
183250
from the global configuration under the `github issues: spam keywords` key
184251
"""
252+
185253
_urls = ["leadgeneration.com", "leadmagnet.com"]
186254
_keywords = [
187255
"100 times more effective",
@@ -244,7 +312,7 @@ def is_likely_spam(
244312
if re.search(url_regex, body):
245313
return True
246314

247-
return False
315+
return is_likely_spam_from_genai(body, model=model)
248316

249317

250318
def prefill_github_issue_url(

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,7 @@ exclude = '''(?x)(
1010
[[tool.mypy.overrides]]
1111
module = "docassemble.base.*"
1212
ignore_missing_imports = true
13+
14+
[[tool.mypy.overrides]]
15+
module = "google.*"
16+
ignore_missing_imports = true

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def find_package_data(where='.', package='', exclude=standard_exclude, exclude_d
5353
url='https://courtformsonline.org',
5454
packages=find_packages(),
5555
namespace_packages=['docassemble'],
56-
install_requires=['docassemble.ALToolbox>=0.6.0'],
56+
install_requires=['docassemble.ALToolbox>=0.6.0', 'google-generativeai'],
5757
zip_safe=False,
5858
package_data=find_package_data(where='docassemble/GithubFeedbackForm/', package='docassemble.GithubFeedbackForm'),
5959
)

0 commit comments

Comments
 (0)