Handle multiple code blocks per fragment

nirinchev · nirinchev · commit 0b1c66a213f3 · 2024-10-22T14:10:26.000+02:00
diff --git a/src/participant/streamParsing.ts b/src/participant/streamParsing.ts
@@ -86,20 +86,24 @@ class FragmentMatcher {
   private _endMatcher: StreamingKMP;
   private _matchedContent?: string;
   private _onContentMatched: (content: string) => void;
+  private _onFragmentProcessed: (content: string) => void;
 
   constructor({
     identifier,
     onContentMatched,
+    onFragmentProcessed,
   }: {
     identifier: {
       start: string;
       end: string;
     };
     onContentMatched: (content: string) => void;
+    onFragmentProcessed: (content: string) => void;
   }) {
     this._startMatcher = new StreamingKMP(identifier.start);
     this._endMatcher = new StreamingKMP(identifier.end);
     this._onContentMatched = onContentMatched;
+    this._onFragmentProcessed = onFragmentProcessed;
   }
 
   private _contentMatched(): void {
@@ -116,6 +120,18 @@ class FragmentMatcher {
     this._endMatcher.reset();
   }
 
+  // Must be invoked right before every recursive `process` call and whenever `process` finishes
+  // handling a fragment. It forwards to `_onFragmentProcessed` the part of `fragment` consumed so
+  // far: the prefix before `index` if given, else the whole fragment - whether or not we matched.
+  private _partialFragmentProcessed(
+    fragment: string,
+    index: number | undefined = undefined
+  ): void {
+    this._onFragmentProcessed(
+      index === undefined ? fragment : fragment.slice(0, index)
+    );
+  }
+
   public process(fragment: string): void {
     if (this._matchedContent === undefined) {
       // We haven't matched the start identifier yet, so try and do that
@@ -124,19 +140,24 @@ class FragmentMatcher {
         // We found a match for the start identifier - update `_matchedContent` to an empty string
         // and recursively call `process` with the remainder of the fragment.
         this._matchedContent = '';
+        this._partialFragmentProcessed(fragment, startIndex);
         this.process(fragment.slice(startIndex));
+      } else {
+        this._partialFragmentProcessed(fragment);
       }
     } else {
       const endIndex = this._endMatcher.match(fragment);
       if (endIndex !== -1) {
         // We've matched the end - emit the matched content and continue processing the partial fragment
         this._matchedContent += fragment.slice(0, endIndex);
+        this._partialFragmentProcessed(fragment, endIndex);
         this._contentMatched();
         this.process(fragment.slice(endIndex));
       } else {
         // We haven't matched the end yet - append the fragment to the matched content and wait
         // for a future fragment to contain the end identifier.
         this._matchedContent += fragment;
+        this._partialFragmentProcessed(fragment);
       }
     }
   }
@@ -165,10 +186,10 @@ export async function processStreamWithIdentifiers({
   const fragmentMatcher = new FragmentMatcher({
     identifier,
     onContentMatched: onStreamIdentifier,
+    onFragmentProcessed: processStreamFragment,
   });
 
   for await (const fragment of inputIterable) {
-    processStreamFragment(fragment);
     fragmentMatcher.process(fragment);
   }
 }
diff --git a/src/test/suite/participant/streamParsing.test.ts b/src/test/suite/participant/streamParsing.test.ts
@@ -216,4 +216,80 @@ suite('processStreamWithIdentifiers', () => {
     expect(fragmentsProcessed.join('')).to.deep.equal(inputFragments.join(''));
     expect(identifiersStreamed).to.deep.equal(['\ncode1\n', '\ncode2\n']);
   });
+
+  test('one fragment containing multiple code blocks emits event in correct order', async () => {
+    // In case we have one fragment containing multiple code blocks, we want to make sure that
+    // fragment notifications and identifier notifications arrive in the right order so that we're
+    // adding code actions after the correct subfragment.
+    // For example:
+    // 'Text before code.\n```js\ncode1\n```\nText between code.\n```js\ncode2\n```\nText after code.'
+    //
+    // should emit:
+    //
+    // processStreamFragment: 'Text before code.\n```js\ncode1\n```'
+    // onStreamIdentifier: '\ncode1\n'
+    // processStreamFragment: '\nText between code.\n```js\ncode2\n```'
+    // onStreamIdentifier: '\ncode2\n'
+    // processStreamFragment: '\nText after code.'
+    //
+    // in that order to ensure we add each code action immediately after the code block
+    // rather than add both at the end.
+
+    const inputFragments = [
+      'Text before code.\n```js\ncode1\n```\nText between code.\n```js\ncode2\n```\nText after code.',
+    ];
+
+    const inputIterable = asyncIterableFromArray<string>(inputFragments);
+    const identifier = { start: '```js', end: '```' };
+
+    const fragmentsEmitted: {
+      source: 'processStreamFragment' | 'onStreamIdentifier';
+      content: string;
+    }[] = [];
+
+    const getFragmentHandler = (
+      source: 'processStreamFragment' | 'onStreamIdentifier'
+    ): ((fragment: string) => void) => {
+      return (fragment: string): void => {
+        // It's an implementation detail, but the way the code is structured today, we're splitting the emitted fragments
+        // whenever we find either a start or end identifier. This is irrelevant as long as we're emitting the entirety of
+        // the text until the end of the code block in `processStreamFragment` and then the code block itself in `onStreamIdentifier`.
+        // With the code below, we're combining all subfragments with the same source to make the test verify the desired
+        // behavior rather than the actual implementation.
+        const lastFragment = fragmentsEmitted[fragmentsEmitted.length - 1];
+        if (lastFragment?.source === source) {
+          lastFragment.content += fragment;
+        } else {
+          fragmentsEmitted.push({ source, content: fragment });
+        }
+      };
+    };
+
+    await processStreamWithIdentifiers({
+      processStreamFragment: getFragmentHandler('processStreamFragment'),
+      onStreamIdentifier: getFragmentHandler('onStreamIdentifier'),
+      inputIterable,
+      identifier,
+    });
+
+    expect(fragmentsEmitted).to.have.length(5);
+    expect(fragmentsEmitted[0].source).to.equal('processStreamFragment');
+    expect(fragmentsEmitted[0].content).to.equal(
+      'Text before code.\n```js\ncode1\n```'
+    );
+
+    expect(fragmentsEmitted[1].source).to.equal('onStreamIdentifier');
+    expect(fragmentsEmitted[1].content).to.equal('\ncode1\n');
+
+    expect(fragmentsEmitted[2].source).to.equal('processStreamFragment');
+    expect(fragmentsEmitted[2].content).to.equal(
+      '\nText between code.\n```js\ncode2\n```'
+    );
+
+    expect(fragmentsEmitted[3].source).to.equal('onStreamIdentifier');
+    expect(fragmentsEmitted[3].content).to.equal('\ncode2\n');
+
+    expect(fragmentsEmitted[4].source).to.equal('processStreamFragment');
+    expect(fragmentsEmitted[4].content).to.equal('\nText after code.');
+  });
 });