@@ -62,6 +62,66 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str,
     return summary, tree, content
 
 
+def format_node_with_context_limit(
+    node: FileSystemNode,
+    query: IngestionQuery,
+    max_tokens: int,
+) -> tuple[str, str, str]:
+    """Generate content that fits within a token limit using a greedy knapsack heuristic.
+
+    Uses relevance scores to prioritize files and maximize value within the token budget.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        The file system node to be summarized.
+    query : IngestionQuery
+        The parsed query object containing information about the repository and query parameters.
+    max_tokens : int
+        Maximum number of tokens allowed for the output.
+
+    Returns
+    -------
+    tuple[str, str, str]
+        A tuple containing the summary, directory structure, and optimized file contents.
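+
+    Examples
+    --------
+    Illustrative sketch (``node`` and ``query`` are assumed to come from an
+    earlier ingestion step; the budget is hypothetical):
+
+    >>> summary, tree, content = format_node_with_context_limit(
+    ...     node, query, max_tokens=8_000,
+    ... )  # doctest: +SKIP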
87+ """
+    is_single_file = node.type == FileSystemNodeType.FILE
+    summary = _create_summary_prefix(query, single_file=is_single_file)
+
+    # Generate the tree structure (always included)
+    tree = "Directory structure:\n" + _create_tree_structure(query, node=node)
+    tree_tokens = _count_tokens(tree)
+
+    # Reserve tokens for the summary and tree
+    summary_base_tokens = _count_tokens(summary) + 100  # 100-token buffer for final summary additions
+    available_tokens = max_tokens - tree_tokens - summary_base_tokens
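+    # Illustrative budget (hypothetical numbers): with max_tokens=10_000, a
+    # 1_200-token tree, and a ~300-token summary, roughly 8_500 tokens remain
+    # for file contents.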
+
+    if available_tokens <= 0:
+        # Not enough space even for the tree; return minimal content
+        content = "[Content omitted - insufficient token space]"
+        summary += f"\nEstimated tokens: {_format_token_count(summary + tree + content)}"
+        return summary, tree, content
+
+    # Apply the greedy knapsack heuristic to select file contents
+    optimized_content = _optimize_content_with_knapsack(node, available_tokens)
+
+    # Update the summary with final info
+    if node.type == FileSystemNodeType.DIRECTORY:
+        # Count the files actually included (each begins with a 48-char '=' separator)
+        included_files = len([line for line in optimized_content.split('\n') if line.startswith('=' * 48)])
+        summary += f"Files included: {included_files} (optimized for {max_tokens:,} tokens)\n"
+    elif node.type == FileSystemNodeType.FILE:
+        summary += f"File: {node.name}\n"
+        summary += f"Lines: {len(node.content.splitlines()):,}\n"
+
+    final_content = summary + "\n" + tree + "\n" + optimized_content
+    token_estimate = _format_token_count(final_content)
+    if token_estimate:
+        summary += f"\nEstimated tokens: {token_estimate}"
+
+    return summary, tree, optimized_content
+
+
 def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False) -> str:
     """Create a prefix string for summarizing a repository or local directory.
 
@@ -191,6 +251,27 @@ def _create_tree_structure(
     return tree_str
 
 
+def _count_tokens(text: str) -> int:
+    """Count tokens in ``text`` using tiktoken.
+
+    Parameters
+    ----------
+    text : str
+        The text to count tokens for.
+
+    Returns
+    -------
+    int
+        Number of tokens, or a ``len(text) // 4`` estimate if tiktoken fails.
+    """
+    try:
+        encoding = tiktoken.get_encoding("o200k_base")
+        return len(encoding.encode(text, disallowed_special=()))
+    except Exception:
+        # Fall back to a character-based estimate
+        return len(text) // 4
+
+
 def _format_token_count(text: str) -> str | None:
     """Return a human-readable token-count string (e.g. 1.2k, 1.2M).
 
@@ -206,8 +287,7 @@ def _format_token_count(text: str) -> str | None:
 
     """
     try:
-        encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
-        total_tokens = len(encoding.encode(text, disallowed_special=()))
+        total_tokens = _count_tokens(text)  # falls back internally, so this except is now defensive
     except (ValueError, UnicodeEncodeError) as exc:
         logger.warning("Failed to estimate token size", extra={"error": str(exc)})
         return None
@@ -221,3 +301,184 @@ def _format_token_count(text: str) -> str | None:
             return f"{total_tokens / threshold:.1f}{suffix}"
 
     return str(total_tokens)
+
+
+def _optimize_content_with_knapsack(node: FileSystemNode, max_tokens: int) -> str:
+    """Select file contents within the token limit using a greedy knapsack heuristic.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        Root node to extract files from.
+    max_tokens : int
+        Maximum tokens available for content.
+
+    Returns
+    -------
+    str
+        Content string built from the selected files.
320+ """
+    # Collect all files with their metadata
+    file_items = []
+    _collect_file_items(node, file_items)
+
+    if not file_items:
+        return "[No files found]"
+
+    # Compute a value/cost ratio for each file and sort by it
+    for item in file_items:
+        relevance_score = max(item['relevance'], 1)  # floor at 1 so zero-relevance files keep some value
+        file_type_multiplier = _get_file_type_multiplier(item['path'])
+
+        # Value = relevance * type multiplier * content quality
+        content_quality = _estimate_content_quality(item['content'])
+        value = relevance_score * file_type_multiplier * content_quality
+
+        # Cost = token count
+        cost = item['tokens']
+
+        # Ratio = value per token (higher is better)
+        item['ratio'] = value / max(cost, 1)
+
+    # Sort by ratio, descending (best value density first)
+    sorted_items = sorted(file_items, key=lambda x: x['ratio'], reverse=True)
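+    # Sorting by value density is the classic greedy heuristic for 0/1
+    # knapsack: O(n log n) and usually close to optimal, but not guaranteed
+    # optimal (an exact solution would require dynamic programming).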
345+
346+ # Greedy selection: pick highest ratio items that fit
347+ selected_items = []
348+ total_tokens = 0
349+
350+ for item in sorted_items :
351+ if total_tokens + item ['tokens' ] <= max_tokens :
352+ selected_items .append (item )
353+ total_tokens += item ['tokens' ]
354+
355+ # Build final content string
356+ if not selected_items :
357+ return "[No files fit within token limit]"
358+
359+ content_parts = []
360+ for item in selected_items :
361+ content_parts .append (item ['content_string' ])
362+
363+ result = "\n " .join (content_parts )
364+
365+ logger .info (
366+ f"Knapsack optimization: selected { len (selected_items )} /{ len (file_items )} files, "
367+ f"using { total_tokens } /{ max_tokens } tokens"
368+ )
369+
370+ return result
371+
+
+def _collect_file_items(node: FileSystemNode, items: list[dict]) -> None:
+    """Recursively collect file metadata for knapsack optimization.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        Current node to process.
+    items : list[dict]
+        List to append file-item dicts to.
+    """
+    if node.type == FileSystemNodeType.FILE:
+        content_string = node.content_string
+        tokens = _count_tokens(content_string)
+
+        items.append({
+            'path': node.path_str or node.name,
+            'content': node.content,
+            'content_string': content_string,
+            'tokens': tokens,
+            'relevance': node.likelihood_score,
+            'size': node.size,
+            'node': node,
+        })
+
+    elif node.type == FileSystemNodeType.DIRECTORY and node.children:
+        for child in node.children:
+            _collect_file_items(child, items)
+
+
+def _get_file_type_multiplier(file_path: str) -> float:
+    """Get a relevance multiplier based on file type/name.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the file.
+
+    Returns
+    -------
+    float
+        Multiplier for this file type (higher = more important).
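+
+    Examples
+    --------
+    >>> _get_file_type_multiplier('src/main.py')
+    2.0
+    >>> _get_file_type_multiplier('notes.md')
+    1.1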
414+ """
415+ from pathlib import Path
416+
417+ path = Path (file_path )
418+ name_lower = path .name .lower ()
419+ ext_lower = path .suffix .lower ()
420+
421+ # High priority files
422+ if any (pattern in name_lower for pattern in ['readme' , 'main' , 'index' , 'app' , 'server' , '__init__' ]):
423+ return 2.0
424+
425+ # Important code files
426+ if ext_lower in {'.py' , '.js' , '.ts' , '.java' , '.cpp' , '.c' , '.go' , '.rs' , '.rb' }:
427+ return 1.5
428+
429+ # Config and setup files
430+ if ext_lower in {'.json' , '.yaml' , '.yml' , '.toml' , '.ini' , '.env' } or name_lower in {'dockerfile' , 'makefile' }:
431+ return 1.3
432+
433+ # Documentation
434+ if ext_lower in {'.md' , '.txt' , '.rst' }:
435+ return 1.1
436+
437+ # Default
438+ return 1.0
+
+
+def _estimate_content_quality(content: str) -> float:
+    """Estimate content quality/informativeness.
+
+    Parameters
+    ----------
+    content : str
+        File content to analyze.
+
+    Returns
+    -------
+    float
+        Quality score (higher = more informative).
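+
+    Examples
+    --------
+    A dense two-line function (``def f():`` followed by an indented
+    ``return 1``) scores about ``1.15``: density ``1.0``, a ``0.15`` code
+    bonus, and no length penalty.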
453+ """
+    if not content or content.strip() in {'[Binary file]', '[Empty file]', 'Error reading file'}:
+        return 0.1
+
+    lines = content.splitlines()
+    non_empty_lines = [line for line in lines if line.strip()]
+
+    if not non_empty_lines:
+        return 0.2
+
+    # Base score from content density
+    density = len(non_empty_lines) / max(len(lines), 1)
+
+    # Bonus for code-like content
+    code_indicators = 0
+    for line in non_empty_lines[:50]:  # Check the first 50 lines
+        line_stripped = line.strip()
+        if any(indicator in line_stripped for indicator in ['def ', 'class ', 'function ', 'import ', 'from ', 'const ', 'let ', 'var ']):
+            code_indicators += 1
+        if any(char in line_stripped for char in ['{', '}', '(', ')', ';', ':']):
+            code_indicators += 0.5
+
+    code_bonus = min(code_indicators / 10, 1.0)
+
+    # Penalty for very long files (diminishing returns); check the larger
+    # threshold first, otherwise the 2000-line branch is unreachable
+    length_penalty = 1.0
+    if len(lines) > 2000:
+        length_penalty = 0.6
+    elif len(lines) > 1000:
+        length_penalty = 0.8
+
+    return (density + code_bonus) * length_penalty