11// PDF text and metadata extraction utilities
2+ import { PNG } from 'pngjs' ;
23import { OPS } from 'pdfjs-dist/legacy/build/pdf.mjs' ;
/**
 * Encode raw pixel data as a base64-encoded PNG.
 *
 * The samples are first expanded into an 8-bit RGBA buffer of exactly
 * `width * height * 4` bytes, which is the layout handed to pngjs.
 *
 * Supported layouts, selected by `channels`:
 *  - 4: RGBA, copied as-is. Missing trailing bytes stay 0 and surplus bytes
 *       are ignored, so the buffer given to the encoder always has the
 *       right size.
 *  - 3: RGB, expanded with a fully opaque alpha channel.
 *  - 1: grayscale. Normally one byte per pixel; when the buffer is too short
 *       for that but long enough for bit-packed rows, it is decoded as
 *       1 bit per pixel (byte-aligned rows, most significant bit first,
 *       set bit = white).
 *       NOTE(review): a 1-pixel-wide image is ambiguous between the two
 *       grayscale layouts and is treated as one byte per pixel.
 *
 * Any other `channels` value leaves the image zero-filled (transparent).
 * Missing source samples are treated as 0.
 *
 * @param {ArrayLike<number>} pixelData - Raw pixel samples.
 * @param {number} width - Image width in pixels.
 * @param {number} height - Image height in pixels.
 * @param {number} channels - Samples per pixel: 1, 3 or 4.
 * @returns {string} The PNG file contents, base64-encoded.
 */
const encodePixelsToPNG = (pixelData, width, height, channels) => {
  const png = new PNG({ width, height });
  const pixelCount = width * height;
  // Always hand the encoder a correctly sized, zero-initialised RGBA buffer.
  const rgba = Buffer.alloc(pixelCount * 4);

  if (channels === 4) {
    // Already RGBA; copy() clamps to the shorter of the two buffers.
    Buffer.from(pixelData).copy(rgba);
  } else if (channels === 3) {
    // RGB -> RGBA (add an opaque alpha channel).
    for (let i = 0; i < pixelCount; i++) {
      const srcIdx = i * 3;
      const dstIdx = i * 4;
      rgba[dstIdx] = pixelData[srcIdx] ?? 0;
      rgba[dstIdx + 1] = pixelData[srcIdx + 1] ?? 0;
      rgba[dstIdx + 2] = pixelData[srcIdx + 2] ?? 0;
      rgba[dstIdx + 3] = 255;
    }
  } else if (channels === 1) {
    const bytesPerPackedRow = (width + 7) >> 3;
    const isBitPacked =
      pixelData.length < pixelCount &&
      pixelData.length >= bytesPerPackedRow * height;

    for (let y = 0; y < height; y++) {
      for (let x = 0; x < width; x++) {
        let gray;
        if (isBitPacked) {
          // Rows start on a byte boundary; the leftmost pixel is the MSB.
          const byte = pixelData[y * bytesPerPackedRow + (x >> 3)] ?? 0;
          gray = byte & (0x80 >> (x & 7)) ? 255 : 0;
        } else {
          gray = pixelData[y * width + x] ?? 0;
        }
        const dstIdx = (y * width + x) * 4;
        rgba[dstIdx] = gray;
        rgba[dstIdx + 1] = gray;
        rgba[dstIdx + 2] = gray;
        rgba[dstIdx + 3] = 255;
      }
    }
  }

  png.data = rgba;
  // Encode to PNG and convert to base64.
  return PNG.sync.write(png).toString('base64');
};
340/**
441 * Extract metadata and page count from a PDF document
542 */
@@ -78,38 +115,83 @@ const extractImagesFromPage = async (page, pageNum) => {
78115 imageIndices . push ( i ) ;
79116 }
80117 }
81- // Extract each image using Promise-based approach
118+ // Extract each image - try sync first, then async if needed
82119 const imagePromises = imageIndices . map ( ( imgIndex , arrayIndex ) => new Promise ( ( resolve ) => {
83120 const argsArray = operatorList . argsArray [ imgIndex ] ;
84121 if ( ! argsArray || argsArray . length === 0 ) {
85122 resolve ( null ) ;
86123 return ;
87124 }
88125 const imageName = argsArray [ 0 ] ;
89- // Use callback-based get() as images may not be resolved yet
90- page . objs . get ( imageName , ( imageData ) => {
126+ // Helper to process image data
127+ const processImageData = ( imageData ) => {
91128 if ( ! imageData || typeof imageData !== 'object' ) {
92- resolve ( null ) ;
93- return ;
129+ return null ;
94130 }
95131 const img = imageData ;
96132 if ( ! img . data || ! img . width || ! img . height ) {
97- resolve ( null ) ;
98- return ;
133+ return null ;
99134 }
100- // Determine image format based on kind
101- // kind === 1 = grayscale, 2 = RGB, 3 = RGBA
135+ // Determine number of channels based on kind
136+ // kind === 1 = grayscale (1 channel), 2 = RGB (3 channels), 3 = RGBA (4 channels)
137+ const channels = img . kind === 1 ? 1 : img . kind === 3 ? 4 : 3 ;
102138 const format = img . kind === 1 ? 'grayscale' : img . kind === 3 ? 'rgba' : 'rgb' ;
103- // Convert Uint8Array to base64
104- const base64 = Buffer . from ( img . data ) . toString ( 'base64' ) ;
105- resolve ( {
139+ // Encode raw pixel data to PNG format
140+ const pngBase64 = encodePixelsToPNG ( img . data , img . width , img . height , channels ) ;
141+ return {
106142 page : pageNum ,
107143 index : arrayIndex ,
108144 width : img . width ,
109145 height : img . height ,
110146 format,
111- data : base64 ,
112- } ) ;
147+ data : pngBase64 ,
148+ } ;
149+ } ;
150+ // Try to get from commonObjs first if it starts with 'g_'
151+ if ( imageName . startsWith ( 'g_' ) ) {
152+ try {
153+ const imageData = page . commonObjs . get ( imageName ) ;
154+ if ( imageData ) {
155+ const result = processImageData ( imageData ) ;
156+ resolve ( result ) ;
157+ return ;
158+ }
159+ }
160+ catch ( error ) {
161+ const message = error instanceof Error ? error . message : String ( error ) ;
162+ console . warn ( `[PDF Reader MCP] Error getting image from commonObjs ${ imageName } : ${ message } ` ) ;
163+ }
164+ }
165+ // Try synchronous get first - if image is already loaded
166+ try {
167+ const imageData = page . objs . get ( imageName ) ;
168+ if ( imageData !== undefined ) {
169+ const result = processImageData ( imageData ) ;
170+ resolve ( result ) ;
171+ return ;
172+ }
173+ }
174+ catch ( error ) {
175+ // Synchronous get failed or not supported, fall through to async
176+ const message = error instanceof Error ? error . message : String ( error ) ;
177+ console . warn ( `[PDF Reader MCP] Sync image get failed for ${ imageName } , trying async: ${ message } ` ) ;
178+ }
179+ // Fallback to async callback-based get with timeout
180+ let resolved = false ;
181+ const timeout = setTimeout ( ( ) => {
182+ if ( ! resolved ) {
183+ resolved = true ;
184+ console . warn ( `[PDF Reader MCP] Image extraction timeout for ${ imageName } on page ${ String ( pageNum ) } ` ) ;
185+ resolve ( null ) ;
186+ }
187+ } , 10000 ) ; // 10 second timeout as a safety net
188+ page . objs . get ( imageName , ( imageData ) => {
189+ if ( ! resolved ) {
190+ resolved = true ;
191+ clearTimeout ( timeout ) ;
192+ const result = processImageData ( imageData ) ;
193+ resolve ( result ) ;
194+ }
113195 } ) ;
114196 } ) ) ;
115197 const resolvedImages = await Promise . all ( imagePromises ) ;
@@ -196,7 +278,7 @@ export const extractPageContent = async (pdfDocument, pageNum, includeImages, so
196278 imageIndices . push ( i ) ;
197279 }
198280 }
199- // Extract each image with its Y-coordinate
281+ // Extract each image with its Y-coordinate - try sync first, then async if needed
200282 const imagePromises = imageIndices . map ( ( imgIndex , arrayIndex ) => new Promise ( ( resolve ) => {
201283 const argsArray = operatorList . argsArray [ imgIndex ] ;
202284 if ( ! argsArray || argsArray . length === 0 ) {
@@ -205,32 +287,29 @@ export const extractPageContent = async (pdfDocument, pageNum, includeImages, so
205287 }
206288 const imageName = argsArray [ 0 ] ;
207289 // Get transform matrix from the args (if available)
208- // The transform is typically in argsArray[1] for some ops
209290 let yPosition = 0 ;
210291 if ( argsArray . length > 1 && Array . isArray ( argsArray [ 1 ] ) ) {
211292 const transform = argsArray [ 1 ] ;
212- // transform[5] is the Y coordinate
213293 const yCoord = transform [ 5 ] ;
214294 if ( yCoord !== undefined ) {
215295 yPosition = Math . round ( yCoord ) ;
216296 }
217297 }
218- // Use callback-based get() as images may not be resolved yet
219- page . objs . get ( imageName , ( imageData ) => {
298+ // Helper to process image data
299+ const processImageData = ( imageData ) => {
220300 if ( ! imageData || typeof imageData !== 'object' ) {
221- resolve ( null ) ;
222- return ;
301+ return null ;
223302 }
224303 const img = imageData ;
225304 if ( ! img . data || ! img . width || ! img . height ) {
226- resolve ( null ) ;
227- return ;
305+ return null ;
228306 }
229- // Determine image format based on kind
307+ // Determine number of channels based on kind
308+ const channels = img . kind === 1 ? 1 : img . kind === 3 ? 4 : 3 ;
230309 const format = img . kind === 1 ? 'grayscale' : img . kind === 3 ? 'rgba' : 'rgb' ;
231- // Convert Uint8Array to base64
232- const base64 = Buffer . from ( img . data ) . toString ( 'base64' ) ;
233- resolve ( {
310+ // Encode raw pixel data to PNG format
311+ const pngBase64 = encodePixelsToPNG ( img . data , img . width , img . height , channels ) ;
312+ return {
234313 type : 'image' ,
235314 yPosition,
236315 imageData : {
@@ -239,9 +318,54 @@ export const extractPageContent = async (pdfDocument, pageNum, includeImages, so
239318 width : img . width ,
240319 height : img . height ,
241320 format,
242- data : base64 ,
321+ data : pngBase64 ,
243322 } ,
244- } ) ;
323+ } ;
324+ } ;
325+ // Try to get from commonObjs first if it starts with 'g_'
326+ if ( imageName . startsWith ( 'g_' ) ) {
327+ try {
328+ const imageData = page . commonObjs . get ( imageName ) ;
329+ if ( imageData ) {
330+ const result = processImageData ( imageData ) ;
331+ resolve ( result ) ;
332+ return ;
333+ }
334+ }
335+ catch ( error ) {
336+ const message = error instanceof Error ? error . message : String ( error ) ;
337+ console . warn ( `[PDF Reader MCP] Error getting image from commonObjs ${ imageName } : ${ message } ` ) ;
338+ }
339+ }
340+ // Try synchronous get first - if image is already loaded
341+ try {
342+ const imageData = page . objs . get ( imageName ) ;
343+ if ( imageData !== undefined ) {
344+ const result = processImageData ( imageData ) ;
345+ resolve ( result ) ;
346+ return ;
347+ }
348+ }
349+ catch ( error ) {
350+ const message = error instanceof Error ? error . message : String ( error ) ;
351+ console . warn ( `[PDF Reader MCP] Sync image get failed for ${ imageName } , trying async: ${ message } ` ) ;
352+ }
353+ // Fallback to async callback-based get with timeout
354+ let resolved = false ;
355+ const timeout = setTimeout ( ( ) => {
356+ if ( ! resolved ) {
357+ resolved = true ;
358+ console . warn ( `[PDF Reader MCP] Image extraction timeout for ${ imageName } on page ${ String ( pageNum ) } ` ) ;
359+ resolve ( null ) ;
360+ }
361+ } , 10000 ) ; // 10 second timeout as a safety net
362+ page . objs . get ( imageName , ( imageData ) => {
363+ if ( ! resolved ) {
364+ resolved = true ;
365+ clearTimeout ( timeout ) ;
366+ const result = processImageData ( imageData ) ;
367+ resolve ( result ) ;
368+ }
245369 } ) ;
246370 } ) ) ;
247371 const resolvedImages = await Promise . all ( imagePromises ) ;
0 commit comments