30 | 30 |
31 | 31 | @dataclass |
32 | 32 | class TaskData: |
33 | | - """Base interface exposed by the different task types (MLM, NER, etc) to training scripts |
| 33 | + """Base data interface exposed by the different task types (MLM, NER, etc) to training scripts |
34 | 34 |
35 | 35 | Each new task module should implement a method get_task(data_args, tokenizer) -> TaskData |
36 | 36 | """ |
@@ -70,7 +70,13 @@ def split( |
70 | 70 |
71 | 71 |
72 | 72 | class NaiveExampleSplitter(ExampleSplitterBase): |
73 | | - """Split sequences by word, and pull final sequence start forward to fill max allowable length""" |
| 73 | + """Split sequences by word, and pull final sequence start forward if it comes up <50% max len |
| 74 | +
|
| 75 | + This algorithm produces examples by splitting tokens on word boundaries, extending each sample |
| 76 | + until max_content_seq_len is filled. *IF* the final generated example is less than 50% of the |
| 77 | + maximum tokens, its start index will be pulled forward to consume as many words as will fit. |
| 78 | + Apart from this, there will be no overlap between examples. |
| 79 | + """ |
74 | 80 |
75 | 81 | @classmethod |
76 | 82 | def n_examples(cls, n_tokens: int, max_content_seq_len: int) -> int: |
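A minimal sketch of the splitting rule the new docstring describes, assuming per-word token counts are already known. The function and argument names (`naive_split`, `word_token_counts`) are illustrative only; the real class exposes classmethods like `n_examples` and `split`:

```python
from typing import List, Tuple


def naive_split(word_token_counts: List[int], max_content_seq_len: int) -> List[Tuple[int, int]]:
    """Return (start_word, end_word) spans, each filling at most max_content_seq_len tokens."""
    spans = []
    start = 0
    while start < len(word_token_counts):
        end, n_tokens = start, 0
        # Extend on word boundaries until the next word would overflow the limit:
        while (
            end < len(word_token_counts)
            and n_tokens + word_token_counts[end] <= max_content_seq_len
        ):
            n_tokens += word_token_counts[end]
            end += 1
        if end == start:
            end += 1  # Single word longer than the limit: take it anyway (truncated downstream)
        spans.append((start, end))
        start = end

    # If the final example came up short (<50% of max), pull its start forward to
    # consume as many preceding words as will fit - the only overlap allowed:
    if len(spans) > 1:
        last_start, last_end = spans[-1]
        n_tokens = sum(word_token_counts[last_start:last_end])
        if n_tokens < 0.5 * max_content_seq_len:
            while (
                last_start > 0
                and n_tokens + word_token_counts[last_start - 1] <= max_content_seq_len
            ):
                last_start -= 1
                n_tokens += word_token_counts[last_start]
            spans[-1] = (last_start, last_end)
    return spans
```

For example, `naive_split([4, 4, 1], max_content_seq_len=8)` yields `[(0, 2), (1, 3)]`: the 1-token tail is under 50% of the limit, so its start is pulled forward over the preceding 4-token word.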
@@ -143,7 +149,15 @@ def split( |
143 | 149 |
144 | 150 |
145 | 151 | class TextractLayoutLMDatasetBase(Dataset): |
146 | | - """Base class for PyTorch/Hugging Face dataset using Amazon Textract for LayoutLM-based models""" |
| 152 | + """Base class for PyTorch/Hugging Face dataset using Amazon Textract for LayoutLM-based models |
| 153 | +
|
| 154 | + The base dataset assumes fixed/known length, which typically requires analyzing the source data |
| 155 | + on init - but avoids the complications of shuffling iterable dataset samples in a multi-process |
| 156 | + environment, or introducing SageMaker Pipe Mode and RecordIO formats. |
| 157 | +
|
| 158 | + Source data is provided as a folder of Amazon Textract result JSONs, with an optional JSONLines |
| 159 | + manifest file annotating the documents in case the task is supervised. |
| 160 | + """ |
147 | 161 |
148 | 162 | def __init__( |
149 | 163 | self, |
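A minimal sketch of the map-style (fixed-length) layout the docstring above describes. The `__init__` signature is truncated in this hunk, so the `textract_path`/`manifest_file` arguments, the `"textract-ref"` field name, and the directory-scan fallback are all assumptions for illustration:

```python
import json
import os
from typing import Optional

from torch.utils.data import Dataset


class FixedLengthTextractDataset(Dataset):
    """Illustrative skeleton only: the real base class does more work on init."""

    def __init__(self, textract_path: str, manifest_file: Optional[str] = None):
        if manifest_file:
            # Supervised case: JSONLines manifest, one annotation record per document
            with open(manifest_file) as f:
                self.records = [json.loads(line) for line in f if line.strip()]
        else:
            # Unsupervised case: every Textract result JSON in the folder is a document
            self.records = [
                {"textract-ref": os.path.join(textract_path, name)}
                for name in sorted(os.listdir(textract_path))
                if name.lower().endswith(".json")
            ]

    def __len__(self) -> int:
        # Fixed/known length, computed up-front on init: this is what makes the
        # dataset map-style rather than iterable, sidestepping shuffling issues.
        return len(self.records)

    def __getitem__(self, ix: int):
        return self.records[ix]
```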
@@ -286,8 +300,8 @@ def max_content_seq_len(self): |
286 | 300 | class DummyDataCollator: |
287 | 301 | """Data collator that just stacks tensors from inputs. |
288 | 302 |
289 | | - For use with Dataset classes where the leg-work is already done and HF's default |
290 | | - "DataCollatorWithPadding" should explicitly *not* be used. |
| 303 | + For use with Dataset classes where the tokenization and collation leg-work is already done and |
| 304 | + HF's default "DataCollatorWithPadding" should explicitly *not* be used. |
291 | 305 | """ |
292 | 306 |
293 | 307 | def __call__(self, features): |
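The body of `__call__` is truncated in this hunk; a minimal sketch consistent with the docstring (plain per-field stacking, no padding) might be:

```python
import torch


class DummyDataCollator:
    """Data collator that just stacks tensors from inputs."""

    def __call__(self, features):
        # features: list of example dicts mapping field name -> equal-shape Tensor.
        # Since the Dataset already emits fixed-size tensors, collation is a
        # straight stack per field, with no padding applied.
        return {key: torch.stack([f[key] for f in features]) for key in features[0]}
```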