18 changes: 9 additions & 9 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -8,7 +8,7 @@ readme = "README.md"
 
 [tool.poetry.dependencies]
 python = "^3.12"
-boto3 = "^1.39.12"
+boto3 = "^1.40.11"
 constructs = "^10.4.2"
 pytest = "^8.4.1"
 aws-cdk-lib = "^2.206.0"
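Note: the only dependency change is the boto3 bump from ^1.39.12 to ^1.40.11. A throwaway check like the sketch below (illustrative, not part of this PR) can confirm a deployed bundle actually picked up the new SDK:

import boto3

# Parse "major.minor.patch" and compare numerically; a plain string
# comparison would mis-order versions such as 1.4.x vs 1.40.x.
major, minor, patch = (int(part) for part in boto3.__version__.split('.')[:3])
assert (major, minor, patch) >= (1, 40, 11), boto3.__version__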
73 changes: 44 additions & 29 deletions src/functions/chunk_manifest_builder/main.py
@@ -25,15 +25,18 @@ def handler(event, context):
         response = table.get_item(Key={'id': interview_id})
 
         if 'Item' not in response:
-            raise Exception(f"Interview transcription not found: {interview_id}")
+            raise Exception(
+                f"Interview transcription not found: {interview_id}")
 
         interview_data = response['Item']
-        position_name = interview_data['position_name']  # Get from DynamoDB record
+        # Get from DynamoDB record
+        position_name = interview_data['position_name']
         position_description = interview_data['position_description']
 
         print(f"Retrieved from DynamoDB - position_name: {position_name}")
 
-        # Get full utterances from S3 (stored there due to DynamoDB size limits)
+        # Get full utterances from S3 (stored there due to DynamoDB size
+        # limits)
         s3 = boto3.client('s3')
         utterances_key = f"utterances/{interview_id}_utterances.json"
 
@@ -54,10 +57,13 @@ def handler(event, context):
             print(f"Using default bucket name: {bucket_name}")
 
         try:
-            utterances_response = s3.get_object(Bucket=bucket_name, Key=utterances_key)
-            utterances = json.loads(utterances_response['Body'].read().decode('utf-8'))
+            utterances_response = s3.get_object(
+                Bucket=bucket_name, Key=utterances_key)
+            utterances = json.loads(
+                utterances_response['Body'].read().decode('utf-8'))
         except s3.exceptions.NoSuchKey:
-            raise Exception(f"Utterances file not found: s3://{bucket_name}/{utterances_key}")
+            raise Exception(
+                f"Utterances file not found: s3://{bucket_name}/{utterances_key}")
         except Exception as e:
             raise Exception(f"Error reading utterances from S3: {str(e)}")
 
@@ -74,25 +80,23 @@ def handler(event, context):
 
         # Update processing status
         table.update_item(
-            Key={'id': interview_id},
+            Key={
+                'id': interview_id},
             UpdateExpression='SET processing_status = :status, chunk_count = :count',
             ExpressionAttributeValues={
                 ':status': 'chunked',
-                ':count': len(chunks)
-            }
-        )
-
-        return {
-            'statusCode': 200,
-            'interview_id': interview_id,
-            'position_name': position_name,
-            'position_description': position_description,
-            'bucket_name': bucket_name,
-            'chunk_count': len(chunks),
-            'total_duration_ms': chunks[-1]['end_ms'] if chunks else 0,
-            'chunks': [{'chunk_index': c['chunk_index'], 'start_ms': c['start_ms'], 'end_ms': c['end_ms']} for c in
-                       chunks]
-        }
+                ':count': len(chunks)})
+
+        return {'statusCode': 200,
+                'interview_id': interview_id,
+                'position_name': position_name,
+                'position_description': position_description,
+                'bucket_name': bucket_name,
+                'chunk_count': len(chunks),
+                'total_duration_ms': chunks[-1]['end_ms'] if chunks else 0,
+                'chunks': [{'chunk_index': c['chunk_index'],
+                            'start_ms': c['start_ms'],
+                            'end_ms': c['end_ms']} for c in chunks]}
 
     except Exception as e:
         print(f"Error building chunk manifest: {str(e)}")
@@ -135,7 +139,8 @@ def build_chunks(utterances, position_name, interview_id):
         # Check if we should end this chunk
         if chunk_duration >= TARGET_CHUNK_DURATION_MS:
             # Try to find a natural break point (end of candidate answer)
-            break_point = find_natural_break_point(utterances, current_idx, chunk_start_time, MAX_CHUNK_DURATION_MS)
+            break_point = find_natural_break_point(
+                utterances, current_idx, chunk_start_time, MAX_CHUNK_DURATION_MS)
             if break_point != -1:
                 # Adjust chunk to natural break
                 chunk_utterances = utterances[current_start_idx:break_point + 1]
@@ -148,13 +153,16 @@ def build_chunks(utterances, position_name, interview_id):
 
         # Ensure minimum chunk size unless it's the last chunk
         if len(chunk_utterances) > 0:
-            chunk_duration = chunk_utterances[-1]['end_time_ms'] - chunk_start_time
-            if chunk_duration < MIN_CHUNK_DURATION_MS and current_idx < len(utterances) - 1:
+            chunk_duration = chunk_utterances[-1]['end_time_ms'] - \
+                chunk_start_time
+            if chunk_duration < MIN_CHUNK_DURATION_MS and current_idx < len(
+                    utterances) - 1:
                 # Extend chunk to minimum duration
                 while current_idx < len(utterances):
                     utterance = utterances[current_idx]
                     chunk_utterances.append(utterance)
-                    chunk_duration = utterance['end_time_ms'] - chunk_start_time
+                    chunk_duration = utterance['end_time_ms'] - \
+                        chunk_start_time
                     if chunk_duration >= MIN_CHUNK_DURATION_MS:
                         break
                     current_idx += 1
@@ -202,13 +210,19 @@ def build_chunks(utterances, position_name, interview_id):
     return chunks
 
 
-def find_natural_break_point(utterances, current_idx, chunk_start_time, max_duration_ms):
+def find_natural_break_point(
+        utterances,
+        current_idx,
+        chunk_start_time,
+        max_duration_ms):
     """
     Find a natural break point (end of candidate answer) within the maximum duration.
     """
 
     # Look ahead for natural break (candidate finishing an answer)
-    for i in range(current_idx, min(len(utterances), current_idx + 20)):  # Look at next 20 utterances
+    for i in range(
+            current_idx, min(
+                len(utterances), current_idx + 20)):  # Look at next 20 utterances
         utterance = utterances[i]
 
         # Check if we're past max duration
@@ -232,7 +246,8 @@ def build_chunk_text(chunk_utterances):
     lines = []
     for utterance in chunk_utterances:
         timestamp = format_timestamp(utterance['start_time_ms'])
-        lines.append(f"[{timestamp}] {utterance['speaker']}: {utterance['text']}")
+        lines.append(
+            f"[{timestamp}] {utterance['speaker']}: {utterance['text']}")
 
     return '\n'.join(lines)
 
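Aside on the build_chunk_text hunk above: a minimal driver (illustrative, not part of this PR) showing the transcript layout it emits, assuming format_timestamp renders elapsed milliseconds as mm:ss. The utterance fields match the ones this diff reads (speaker, text, start_time_ms, end_time_ms):

# Hypothetical sample utterances; the Russian text mirrors the interviews
# this pipeline processes.
sample_utterances = [
    {'speaker': 'Interviewer', 'text': 'Расскажите о своём опыте.',
     'start_time_ms': 0, 'end_time_ms': 4000},
    {'speaker': 'Candidate', 'text': 'Я восемь лет пишу на Python.',
     'start_time_ms': 4500, 'end_time_ms': 12000},
]

print(build_chunk_text(sample_utterances))
# Expected output (under the mm:ss assumption):
# [00:00] Interviewer: Расскажите о своём опыте.
# [00:04] Candidate: Я восемь лет пишу на Python.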
20 changes: 9 additions & 11 deletions src/functions/chunked_qa_extractor/main.py
@@ -66,13 +66,12 @@ def handler(event, context):
         # Update chunk processing status
         chunk_table = dynamodb.Table('interview_chunks')
         chunk_table.update_item(
-            Key={'id': chunk_id},
+            Key={
+                'id': chunk_id},
             UpdateExpression='SET processing_status = :status, qa_count = :count',
             ExpressionAttributeValues={
                 ':status': 'qa_extracted',
-                ':count': saved_count
-            }
-        )
+                ':count': saved_count})
 
         return {
             'statusCode': 200,
@@ -88,13 +87,12 @@ def handler(event, context):
         # Update chunk with error status
         chunk_table = dynamodb.Table('interview_chunks')
         chunk_table.update_item(
-            Key={'id': chunk_id},
+            Key={
+                'id': chunk_id},
             UpdateExpression='SET processing_status = :status, error_message = :error',
             ExpressionAttributeValues={
                 ':status': 'extraction_failed',
-                ':error': str(e)
-            }
-        )
+                ':error': str(e)})
         raise
 
 
@@ -112,17 +110,17 @@ def extract_qa_from_chunk(bedrock_client, chunk_data):
     model_id = inference_profile_arn
 
     # Build the extraction prompt
-    system_prompt = """You are an expert at extracting structured Q&A data from interview transcripts. 
+    system_prompt = """You are an expert at extracting structured Q&A data from interview transcripts.
 You must return ONLY valid JSON with no additional text or explanation."""
 
-    user_prompt = f"""You'll receive a transcript segment with speakers and timestamps. 
+    user_prompt = f"""You'll receive a transcript segment with speakers and timestamps.
 Return ONLY valid JSON in this exact format:
 {{
   "qa": [
     {{
       "index": 0,
       "q_text": "Question text from interviewer",
-      "a_text": "Complete answer text from candidate", 
+      "a_text": "Complete answer text from candidate",
       "a_start_ms": 123000,
       "a_end_ms": 456000,
       "confidence": "high"
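Since the prompt above promises "ONLY valid JSON", a small validation helper (a sketch under that assumption, not code from this PR) can keep malformed model replies from propagating; the field names mirror the format in user_prompt:

import json

def parse_qa_response(raw_text):
    """Parse the extractor's JSON reply into a list of Q&A dicts."""
    data = json.loads(raw_text)  # raises ValueError on non-JSON replies
    pairs = []
    for item in data.get('qa', []):
        pairs.append({
            'index': item['index'],
            'q_text': item['q_text'],
            'a_text': item['a_text'],
            'a_start_ms': item['a_start_ms'],
            'a_end_ms': item['a_end_ms'],
            # Treat a missing confidence field as the weakest signal.
            'confidence': item.get('confidence', 'low'),
        })
    return pairs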
17 changes: 9 additions & 8 deletions src/functions/qa_extractor/main.py
@@ -70,16 +70,16 @@ def extract_qa_pairs(bedrock_client, transcript):
     """
 
     prompt = f"""
-Please analyze the following Russian interview transcript and extract all question-answer pairs. 
+Please analyze the following Russian interview transcript and extract all question-answer pairs.
 
 The transcript contains dialogue between an Interviewer and a Candidate, marked with "Interviewer:" and "Candidate:" prefixes.
 The conversation is in Russian language.
 
 Extract each question asked by the interviewer and the corresponding answer given by the candidate.
 
 Return the result as a JSON array where each object has "question" and "answer" fields.
 Keep the original Russian text in both question and answer fields.
 
 Rules:
 1. Only extract complete question-answer pairs
 2. Questions should be from the Interviewer
@@ -88,10 +88,10 @@ def extract_qa_pairs(bedrock_client, transcript):
 5. Skip small talk, introductions, and closing remarks
 6. Focus on technical questions and substantial answers
 7. Preserve the original Russian text - do not translate
 
 Transcript:
 {transcript}
 
 Return only the JSON array, no additional text or explanation.
     """
 
@@ -110,7 +110,8 @@ def extract_qa_pairs(bedrock_client, transcript):
     # Get the inference profile ARN from environment variables
     inference_profile_arn = os.environ.get('BEDROCK_INFERENCE_PROFILE_ARN')
     if not inference_profile_arn:
-        raise Exception("BEDROCK_INFERENCE_PROFILE_ARN environment variable not set")
+        raise Exception(
+            "BEDROCK_INFERENCE_PROFILE_ARN environment variable not set")
 
     response = bedrock_client.invoke_model(
         modelId=inference_profile_arn,
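For context on the wrapped guard above: one way the surrounding call might fit together, shown as a hedged sketch rather than the project's actual code. The invoke_model call and the BEDROCK_INFERENCE_PROFILE_ARN variable come from the diff; the Anthropic-style request body is an assumption about the model behind the inference profile:

import json
import os

import boto3

bedrock_client = boto3.client('bedrock-runtime')

inference_profile_arn = os.environ.get('BEDROCK_INFERENCE_PROFILE_ARN')
if not inference_profile_arn:
    raise Exception(
        "BEDROCK_INFERENCE_PROFILE_ARN environment variable not set")

prompt = "..."  # built from the transcript, as in extract_qa_pairs above

response = bedrock_client.invoke_model(
    modelId=inference_profile_arn,
    body=json.dumps({
        'anthropic_version': 'bedrock-2023-05-31',  # assumed body schema
        'max_tokens': 4096,
        'messages': [{'role': 'user', 'content': prompt}],
    }))
result = json.loads(response['body'].read())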
3 changes: 2 additions & 1 deletion src/functions/s3_ingest_handler/main.py
@@ -19,7 +19,8 @@ def handler(event, context):
         bucket = record['s3']['bucket']['name']
         key = unquote_plus(record['s3']['object']['key'])
 
-        # Extract position_name from path (e.g., "python_senior/1.m4a" -> "python_senior")
+        # Extract position_name from path (e.g., "python_senior/1.m4a" ->
+        # "python_senior")
         path_parts = key.split('/')
         if len(path_parts) < 2:
             print(f"Invalid path structure: {key}")
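The rewrapped comment describes the bucket layout this handler assumes: the first path segment names the position. In isolation (illustrative values, not from the PR):

key = "python_senior/1.m4a"        # hypothetical object key
path_parts = key.split('/')
if len(path_parts) >= 2:
    position_name = path_parts[0]  # -> "python_senior"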