
Commit 8e9f2d8 (parent a723d33)

[fix]: minor changes of mistakes and update to new model version in hf

7 files changed: +15, -13 lines


README.md (5 additions, 3 deletions)

```diff
@@ -19,8 +19,8 @@ To integrate the detector with your project on the SuperAnnotate platform, pleas
 The Generated Text Detection model is built on a fine-tuned RoBERTa Large architecture. It has been extensively trained on a diverse dataset that includes internal generations and a subset of the RAID train dataset, enabling it to accurately classify text as either generated (synthetic) or human-written. \
 This model is optimized for robust detection, offering two configurations based on specific needs:
 
-- **Optimized for Low False Positive Rate (FPR):** [AI Detector](https://huggingface.co/SuperAnnotate/ai-detector)
-- **Optimized for High Overall Prediction Accuracy:** [LLM Content Detector V2](https://huggingface.co/SuperAnnotate/roberta-large-llm-content-detector-V2)
+- **Optimized for Low False Positive Rate (FPR):** [AI Detector Low FPR](https://huggingface.co/SuperAnnotate/ai-detector-low-fpr)
+- **Optimized for High Overall Prediction Accuracy:** [AI Detector](https://huggingface.co/SuperAnnotate/ai-detector)
 
 For more details and access to the model weights, please refer to the links above on the Hugging Face Model Hub.
@@ -52,7 +52,9 @@ Hardware requirements will depend on your deployment type. Recommended e
 ### As python file ###
 
 1. Install requirements: `pip install -r generated_text_detector/requirements.txt`
-2. Set the Python path variable: `export PYTHONPATH="."`
+2. Set the environment variables:
+   - `export PYTHONPATH="."`
+   - `export DETECTOR_CONFIG_PATH="etc/configs/detector_config.json"`
 3. Run the API: `uvicorn --host 0.0.0.0 --port 8080 --ssl-keyfile=./key.pem --ssl-certfile=./cert.pem generated_text_detector.fastapi_app:app`
 
 ### As docker containers ###
```

etc/configs/detector_config.json (1 addition, 1 deletion)

```diff
@@ -1,4 +1,4 @@
 {
-    "text_detector_model": "SuperAnnotate/roberta-large-llm-content-detector",
+    "text_detector_model": "SuperAnnotate/ai-detector",
     "code_default_probability": 0.5
 }
```
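The README's new `DETECTOR_CONFIG_PATH` step and the default path in `fastapi_app.py` suggest the config is resolved from an environment variable with a repo-relative fallback. A minimal sketch of that lookup, with the helper name and fallback behavior assumed for illustration:

```python
import json
import os

# Default path matches the corrected value in this commit.
DEFAULT_DETECTOR_CONFIG_PATH = "etc/configs/detector_config.json"


def load_detector_config(default_path: str = DEFAULT_DETECTOR_CONFIG_PATH) -> dict:
    """Hypothetical helper: prefer DETECTOR_CONFIG_PATH, else the default."""
    path = os.environ.get("DETECTOR_CONFIG_PATH", default_path)
    with open(path) as f:
        return json.load(f)
```

With the shipped config this would yield a dict holding `text_detector_model` and `code_default_probability`.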

generated_text_detector/fastapi_app.py (1 addition, 1 deletion)

```diff
@@ -38,7 +38,7 @@
 def parse_args():
     DEFAULT_HOST = "0.0.0.0"
     DEFAULT_PORT = "8080"
-    DEFAULT_DETECTOR_CONFIG_PATH = "etc/detector_config.json"
+    DEFAULT_DETECTOR_CONFIG_PATH = "etc/configs/detector_config.json"
     DEFAULT_DEVICE = "cuda:0"
 
     parser = argparse.ArgumentParser(
```
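The defaults above can be exercised with a small `argparse` sketch. Only the default values come from the diff; the flag names (`--host`, `--port`, `--detector_config_path`, `--device`) are assumptions for illustration:

```python
import argparse

# Defaults as shown in the corrected parse_args() diff.
DEFAULT_HOST = "0.0.0.0"
DEFAULT_PORT = "8080"
DEFAULT_DETECTOR_CONFIG_PATH = "etc/configs/detector_config.json"
DEFAULT_DEVICE = "cuda:0"


def parse_args(argv=None):
    """Sketch of the CLI; flag names are hypothetical."""
    parser = argparse.ArgumentParser(description="Generated text detector API")
    parser.add_argument("--host", default=DEFAULT_HOST)
    parser.add_argument("--port", default=DEFAULT_PORT)
    parser.add_argument("--detector_config_path", default=DEFAULT_DETECTOR_CONFIG_PATH)
    parser.add_argument("--device", default=DEFAULT_DEVICE)
    return parser.parse_args(argv)
```

Running with no arguments now picks up the corrected `etc/configs/detector_config.json` path instead of the stale `etc/detector_config.json`.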

generated_text_detector/requirements.txt (1 addition, 0 deletions)

```diff
@@ -1,4 +1,5 @@
 fastapi==0.110.0
+numpy==1.25.2
 nltk==3.8.1
 starlette==0.36.3
 torch==2.2.1
```

generated_text_detector/utils/aggregated_detector.py (4 additions, 4 deletions)

````diff
@@ -26,7 +26,7 @@ def __init__(
         self.code_block_pattern = re.compile(r"```(\w+)?\s*([\s\S]*?)\s*```")
 
 
-    def __split_text_and_code(self, text: str) -> tuple[str, str, str]:
+    def __split_text_and_code(self, text: str) -> tuple[str, str]:
         """Split input text into text and code blocks.
 
         :param text: Input text
@@ -44,13 +44,13 @@ def __split_text_and_code(self, text: str) -> tuple[str, str, str]:
         return text, code
 
 
-    def detect_report(self, text: str) -> list[tuple[str, float]]:
+    def detect_report(self, text: str) -> dict:
         """Detects if text is generated and prepares a report.
 
         :param text: Input text
         :type text: str
         :return: Text chunks with generated scores
-        :rtype: list[tuple[str, float]]
+        :rtype: dict with keys: 'generated_score' and 'author'
         """
         text, code = self.__split_text_and_code(text)
 
@@ -124,7 +124,7 @@ def __determine_author(generated_score: float) -> Author:
 if __name__ == "__main__":
     import json
 
-    with open("etc/detector_config.json") as f:
+    with open("etc/configs/detector_config.json") as f:
         detector_config = json.load(f)
 
     detector = AggregatedDetector(
````
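The `code_block_pattern` in the first hunk is what makes the corrected two-element return type natural: one prose string, one code string. A runnable sketch of how `__split_text_and_code` plausibly uses it (the function body here is an assumption; only the regex is verbatim from the diff):

````python
import re

# Verbatim from the diff: matches fenced code blocks, capturing the
# optional language tag (group 1) and the block body (group 2).
CODE_BLOCK_PATTERN = re.compile(r"```(\w+)?\s*([\s\S]*?)\s*```")


def split_text_and_code(text: str) -> tuple[str, str]:
    """Sketch of the corrected tuple[str, str] signature: (prose, code)."""
    code = "\n".join(m.group(2) for m in CODE_BLOCK_PATTERN.finditer(text))
    prose = CODE_BLOCK_PATTERN.sub("", text).strip()
    return prose, code
````

This makes the three-element `tuple[str, str, str]` annotation removed in this commit look like a leftover, since only prose and code ever come back.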

generated_text_detector/utils/model/roberta_classifier.py (1 addition, 1 deletion)

```diff
@@ -99,5 +99,5 @@ def forward(
 
 
 if __name__ == "__main__":
-    model = RobertaClassifier.from_pretrained("SuperAnnotate/roberta-large-llm-content-detector")
+    model = RobertaClassifier.from_pretrained("SuperAnnotate/ai-detector")
     print(model)
```

generated_text_detector/utils/text_detector.py (2 additions, 3 deletions)

```diff
@@ -3,7 +3,6 @@
 from nltk.tokenize import sent_tokenize
 from transformers import RobertaTokenizer
 
-from generated_text_detector.controllers.schemas_type import Author
 from generated_text_detector.utils.model.roberta_classifier import RobertaClassifier
 
 
@@ -123,10 +122,10 @@ def detect(self, text: str) -> list[tuple[str, float]]:
 
 if __name__ == "__main__":
     detector = GeneratedTextDetector(
-        "SuperAnnotate/roberta-large-llm-content-detector",
+        "SuperAnnotate/ai-detector",
         "cuda:0"
     )
 
-    res = detector.detect_report("Hello, world!")
+    res = detector.detect("Hello, world!")
 
     print(res)
```
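The `__main__` fix above matters because the two entry points now return different shapes: `detect` is annotated `list[tuple[str, float]]`, while `detect_report` (on `AggregatedDetector`) returns a dict with `generated_score` and `author`. A stub illustration, with the detector internals and the author labels faked; only the annotated types come from the diffs:

```python
class StubDetector:
    """Hypothetical stand-in showing the two return shapes in this commit."""

    def detect(self, text: str) -> list[tuple[str, float]]:
        # Per-chunk (text, generated_score) pairs; the score is a placeholder.
        return [(text, 0.12)]

    def detect_report(self, text: str) -> dict:
        # Aggregated report with the keys named in the corrected docstring.
        # The author labels here are assumptions, not the repo's Author enum.
        score = max(s for _, s in self.detect(text))
        return {"generated_score": score, "author": "Human" if score < 0.5 else "LLM"}
```

Calling `detect_report` where a `list[tuple[str, float]]` was expected (or vice versa) is exactly the kind of mismatch the corrected `__main__` avoids.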
