30 | 30 |
31 | 31 | @dataclass |
32 | 32 | class TaskData: |
33 | | - """Base interface exposed by the different task types (MLM, NER, etc) to training scripts |
| 33 | + """Base data interface exposed by the different task types (MLM, NER, etc) to training scripts |
34 | 34 |
35 | 35 | Each new task module should implement a method get_task(data_args, tokenizer) -> TaskData |
36 | 36 | """ |
@@ -70,7 +70,13 @@ def split( |
70 | 70 |
71 | 71 |
72 | 72 | class NaiveExampleSplitter(ExampleSplitterBase): |
73 | | - """Split sequences by word, and pull final sequence start forward to fill max allowable length""" |
| 73 | + """Split sequences by word, and pull final sequence start forward if it comes up <50% max len |
| 74 | +
|
| 75 | + This algorithm produces examples by splitting tokens on word boundaries, extending each sample |
| 76 | + until max_content_seq_len is filled. *IF* the final generated example is less than 50% of the |
| 77 | + maximum tokens, its start index will be pulled forward to consume as many words as will fit. |
| 78 | + Apart from this, there will be no overlap between examples. |
| 79 | + """ |
74 | 80 |
75 | 81 | @classmethod |
76 | 82 | def n_examples(cls, n_tokens: int, max_content_seq_len: int) -> int: |
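A minimal sketch of the splitting rule the new docstring describes, assuming per-word token counts are already known. The function and argument names (`naive_split`, `word_token_counts`) are illustrative only; the real class exposes classmethods like `n_examples` and `split`:

```python
from typing import List, Tuple


def naive_split(word_token_counts: List[int], max_content_seq_len: int) -> List[Tuple[int, int]]:
    """Return (start_word, end_word) spans, each filling at most max_content_seq_len tokens."""
    spans = []
    start = 0
    while start < len(word_token_counts):
        end, n_tokens = start, 0
        # Extend on word boundaries until the next word would overflow the limit:
        while (
            end < len(word_token_counts)
            and n_tokens + word_token_counts[end] <= max_content_seq_len
        ):
            n_tokens += word_token_counts[end]
            end += 1
        if end == start:
            end += 1  # Single word longer than the limit: take it anyway (truncated downstream)
        spans.append((start, end))
        start = end

    # If the final example came up short (<50% of max), pull its start forward to
    # consume as many preceding words as will fit - the only overlap allowed:
    if len(spans) > 1:
        last_start, last_end = spans[-1]
        n_tokens = sum(word_token_counts[last_start:last_end])
        if n_tokens < 0.5 * max_content_seq_len:
            while (
                last_start > 0
                and n_tokens + word_token_counts[last_start - 1] <= max_content_seq_len
            ):
                last_start -= 1
                n_tokens += word_token_counts[last_start]
            spans[-1] = (last_start, last_end)
    return spans
```

For example, `naive_split([4, 4, 1], max_content_seq_len=8)` yields `[(0, 2), (1, 3)]`: the 1-token tail is under 50% of the limit, so its start is pulled forward over the preceding 4-token word.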
@@ -143,7 +149,15 @@ def split( |
143 | 149 |
144 | 150 |
145 | 151 | class TextractLayoutLMDatasetBase(Dataset): |
146 | | - """Base class for PyTorch/Hugging Face dataset using Amazon Textract for LayoutLM-based models""" |
| 152 | + """Base class for PyTorch/Hugging Face dataset using Amazon Textract for LayoutLM-based models |
| 153 | +
|
| 154 | + The base dataset assumes fixed/known length, which typically requires analyzing the source data |
| 155 | + on init - but avoids the complications of shuffling iterable dataset samples in a multi-process |
| 156 | + environment, or introducing SageMaker Pipe Mode and RecordIO formats. |
| 157 | +
|
| 158 | + Source data is provided as a folder of Amazon Textract result JSONs, with an optional JSONLines |
| 159 | + manifest file annotating the documents in case the task is supervised. |
| 160 | + """ |
147 | 161 |
148 | 162 | def __init__( |
149 | 163 | self, |
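A minimal sketch of the map-style (fixed-length) layout the docstring above describes. The `__init__` signature is truncated in this hunk, so the `textract_path`/`manifest_file` arguments, the `"textract-ref"` field name, and the directory-scan fallback are all assumptions for illustration:

```python
import json
import os
from typing import Optional

from torch.utils.data import Dataset


class FixedLengthTextractDataset(Dataset):
    """Illustrative skeleton only: the real base class does more work on init."""

    def __init__(self, textract_path: str, manifest_file: Optional[str] = None):
        if manifest_file:
            # Supervised case: JSONLines manifest, one annotation record per document
            with open(manifest_file) as f:
                self.records = [json.loads(line) for line in f if line.strip()]
        else:
            # Unsupervised case: every Textract result JSON in the folder is a document
            self.records = [
                {"textract-ref": os.path.join(textract_path, name)}
                for name in sorted(os.listdir(textract_path))
                if name.lower().endswith(".json")
            ]

    def __len__(self) -> int:
        # Fixed/known length, computed up-front on init: this is what makes the
        # dataset map-style rather than iterable, sidestepping shuffling issues.
        return len(self.records)

    def __getitem__(self, ix: int):
        return self.records[ix]
```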
@@ -286,8 +300,8 @@ def max_content_seq_len(self): |
286 | 300 | class DummyDataCollator: |
287 | 301 | """Data collator that just stacks tensors from inputs. |
288 | 302 |
289 | | - For use with Dataset classes where the leg-work is already done and HF's default |
290 | | - "DataCollatorWithPadding" should explicitly *not* be used. |
| 303 | + For use with Dataset classes where the tokenization and collation leg-work is already done and |
| 304 | + HF's default "DataCollatorWithPadding" should explicitly *not* be used. |
291 | 305 | """ |
292 | 306 |
293 | 307 | def __call__(self, features): |
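The body of `__call__` is truncated in this hunk; a minimal sketch consistent with the docstring (plain per-field stacking, no padding) might be:

```python
import torch


class DummyDataCollator:
    """Data collator that just stacks tensors from inputs."""

    def __call__(self, features):
        # features: list of example dicts mapping field name -> equal-shape Tensor.
        # Since the Dataset already emits fixed-size tensors, collation is a
        # straight stack per field, with no padding applied.
        return {key: torch.stack([f[key] for f in features]) for key in features[0]}
```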