Skip to content

Commit 3ae1584

Browse files
committed
Make the untranslated entry stripping script more robust to handle more complex cases
1 parent 30f4d42 commit 3ae1584

File tree

5 files changed

+2442
-391
lines changed

5 files changed

+2442
-391
lines changed

locales/extract-untranslated.sh

Lines changed: 138 additions & 76 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
#!/bin/bash
2-
set -e
2+
set -ex # Enable verbose debugging
33

44
# Get the directory where the script is located
55
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
@@ -11,102 +11,164 @@ LOCALES_DIR="$SCRIPT_DIR"
1111
DOMAIN="messages"
1212
POT_FILE="$LOCALES_DIR/$DOMAIN.pot" # This needs to be an absolute path
1313

14-
# Extract untranslated and fuzzy entries
15-
echo "Extracting untranslated and fuzzy entries..."
14+
# AWK script that extracts and cleans msgid strings, handling multi-line entries and unescaping \" and \n sequences
15+
# This script is written to a temporary file to avoid issues with 'read -r -d'
16+
AWK_EXTRACT_MSGID_SCRIPT_PATH="$LOCALES_DIR/awk_extract_msgid.awk"
17+
cat << 'EOF_AWK_EXTRACT_MSGID' > "$AWK_EXTRACT_MSGID_SCRIPT_PATH"
18+
BEGIN {
19+
current_msgid_raw = "";
20+
in_msgid_block = 0;
21+
}
22+
23+
/^msgid / {
24+
if (in_msgid_block) {
25+
cleaned_msgid = current_msgid_raw;
26+
sub(/^msgid /, "", cleaned_msgid);
27+
if (length(cleaned_msgid) > 0 && substr(cleaned_msgid, 1, 1) == "\"" && substr(cleaned_msgid, length(cleaned_msgid), 1) == "\"") {
28+
cleaned_msgid = substr(cleaned_msgid, 2, length(cleaned_msgid) - 2);
29+
}
30+
gsub(/\n"/, "\n", cleaned_msgid);
31+
gsub(/\\"/, "\"", cleaned_msgid);
32+
gsub(/\\n/, "\n", cleaned_msgid);
33+
print cleaned_msgid;
34+
}
35+
current_msgid_raw = $0;
36+
in_msgid_block = 1;
37+
next;
38+
}
39+
40+
/^msgstr / {
41+
cleaned_msgid = current_msgid_raw;
42+
sub(/^msgid /, "", cleaned_msgid);
43+
if (length(cleaned_msgid) > 0 && substr(cleaned_msgid, 1, 1) == "\"" && substr(cleaned_msgid, length(cleaned_msgid), 1) == "\"") {
44+
cleaned_msgid = substr(cleaned_msgid, 2, length(cleaned_msgid) - 2);
45+
}
46+
gsub(/\n"/, "\n", cleaned_msgid);
47+
gsub(/\\"/, "\"", cleaned_msgid);
48+
gsub(/\\n/, "\n", cleaned_msgid);
49+
print cleaned_msgid;
50+
51+
current_msgid_raw = "";
52+
in_msgid_block = 0;
53+
next;
54+
}
55+
56+
/^"/ {
57+
if (in_msgid_block) {
58+
current_msgid_raw = current_msgid_raw "\n" $0;
59+
}
60+
next;
61+
}
62+
63+
/^#/ {
64+
next;
65+
}
66+
67+
/^$/ {
68+
if (in_msgid_block) {
69+
cleaned_msgid = current_msgid_raw;
70+
sub(/^msgid /, "", cleaned_msgid);
71+
if (length(cleaned_msgid) > 0 && substr(cleaned_msgid, 1, 1) == "\"" && substr(cleaned_msgid, length(cleaned_msgid), 1) == "\"") {
72+
cleaned_msgid = substr(cleaned_msgid, 2, length(cleaned_msgid) - 2);
73+
}
74+
gsub(/\n"/, "\n", cleaned_msgid);
75+
gsub(/\\"/, "\"", cleaned_msgid);
76+
gsub(/\\n/, "\n", cleaned_msgid);
77+
print cleaned_msgid;
78+
}
79+
current_msgid_raw = "";
80+
in_msgid_block = 0;
81+
next;
82+
}
83+
84+
END {
85+
if (in_msgid_block && current_msgid_raw != "") {
86+
cleaned_msgid = current_msgid_raw;
87+
sub(/^msgid /, "", cleaned_msgid);
88+
if (length(cleaned_msgid) > 0 && substr(cleaned_msgid, 1, 1) == "\"" && substr(cleaned_msgid, length(cleaned_msgid), 1) == "\"") {
89+
cleaned_msgid = substr(cleaned_msgid, 2, length(cleaned_msgid) - 2);
90+
}
91+
gsub(/\n"/, "\n", cleaned_msgid);
92+
gsub(/\\"/, "\"", cleaned_msgid);
93+
gsub(/\\n/, "\n", cleaned_msgid);
94+
print cleaned_msgid;
95+
}
96+
}
97+
EOF_AWK_EXTRACT_MSGID
98+
99+
echo "Extracting untranslated and missing entries..."
16100
for po in "$LOCALES_DIR"/*/LC_MESSAGES/$DOMAIN.po; do
17101
[ -f "$po" ] || continue
18102
lang=$(basename "$(dirname "$(dirname "$po")")")
19103
tmpfile="$LOCALES_DIR/untranslated_${lang}.tmp"
104+
tmpfile_pot_msgids="$LOCALES_DIR/all_pot_msgids.tmp"
105+
tmpfile_po_translated_msgids="$LOCALES_DIR/po_translated_msgids_${lang}.tmp"
20106

21-
# Clear the tmpfile first
107+
# Clear tmp files
22108
: > "$tmpfile"
109+
: > "$tmpfile_pot_msgids"
110+
: > "$tmpfile_po_translated_msgids"
23111

24-
# Extract untranslated messages by directly parsing the .po file
25-
awk -f - "$po" > "$tmpfile" << 'EOF_AWK'
26-
BEGIN {
27-
current_msgid = "";
28-
current_msgstr = "";
29-
is_fuzzy = 0;
30-
in_entry = 0; # 0: outside entry, 1: in msgid, 2: in msgstr
31-
}
112+
echo "--- Processing language: $lang ---"
32113

33-
# Handle comments and fuzzy flag
34-
/^#/ {
35-
if ($0 ~ /#, fuzzy/) {
36-
is_fuzzy = 1;
37-
}
38-
next;
39-
}
114+
# 1. Extract all msgids from the .pot file
115+
echo "Step 1: Extracting all msgids from $POT_FILE..."
116+
awk -f "$AWK_EXTRACT_MSGID_SCRIPT_PATH" "$POT_FILE" | sort -u > "$tmpfile_pot_msgids"
117+
echo "Step 1 Complete: All msgids from $POT_FILE extracted to $tmpfile_pot_msgids"
40118

41-
# Start of a new msgid
119+
# 2. Extract all *translated* msgids from the .po file
120+
echo "Step 2: Extracting translated msgids from $po..."
121+
grep -P -A 1 '^msgid ' "$po" | awk '
122+
BEGIN {
123+
current_msgid_block = "";
124+
in_msgid_section = 0;
125+
}
42126
/^msgid / {
43-
# Process the previous entry before starting a new one
44-
if (in_entry == 2 && current_msgid != "" && current_msgstr == "" && is_fuzzy == 0) {
45-
print current_msgid "|||" current_msgstr;
46-
}
47-
48-
# Reset for the new entry
49-
current_msgid = $0;
50-
sub(/^msgid /, "", current_msgid); # Remove "msgid "
51-
# Remove leading and trailing quotes from msgid
52-
if (current_msgid ~ /^".*"$/) {
53-
current_msgid = substr(current_msgid, 2, length(current_msgid) - 2);
54-
}
55-
56-
current_msgstr = "";
57-
is_fuzzy = 0;
58-
in_entry = 1; # Now in msgid block
127+
current_msgid_block = $0;
128+
in_msgid_section = 1;
59129
next;
60130
}
61-
62-
# Start of a new msgstr
63-
/^msgstr / {
64-
current_msgstr = $0;
65-
sub(/^msgstr /, "", current_msgstr); # Remove "msgstr "
66-
# Remove leading and trailing quotes from msgstr
67-
if (current_msgstr ~ /^".*"$/) {
68-
current_msgstr = substr(current_msgstr, 2, length(current_msgstr) - 2);
131+
/^msgstr "[^"]+"$/ { # msgstr is not empty
132+
if (in_msgid_section) {
133+
print current_msgid_block; # Print the msgid block
69134
}
70-
in_entry = 2; # Now in msgstr block
135+
in_msgid_section = 0;
136+
current_msgid_block = "";
71137
next;
72138
}
73-
74-
# Continuation lines (quoted strings)
75-
/^"/ {
76-
line_content = $0;
77-
# Remove leading and trailing quotes from continuation lines
78-
if (line_content ~ /^".*"$/) {
79-
line_content = substr(line_content, 2, length(line_content) - 2);
80-
}
81-
82-
if (in_entry == 1) { # Appending to msgid
83-
current_msgid = current_msgid line_content;
84-
} else if (in_entry == 2) { # Appending to msgstr
85-
current_msgstr = current_msgstr line_content;
139+
/^"/ { # Continuation lines for msgid
140+
if (in_msgid_section) {
141+
current_msgid_block = current_msgid_block "\n" $0;
86142
}
87143
next;
88144
}
89-
90-
# Empty line (marks end of an entry)
91-
/^$/ {
92-
# Check if the completed entry is untranslated and not fuzzy
93-
if (in_entry == 2 && current_msgid != "" && current_msgstr == "" && is_fuzzy == 0) {
94-
print current_msgid "|||" current_msgstr;
95-
}
96-
# Reset for the next entry
97-
current_msgid = "";
98-
current_msgstr = "";
99-
is_fuzzy = 0;
100-
in_entry = 0;
145+
/^#/ { next; } # Ignore comments
146+
/^$/ { # Empty line, end of entry
147+
in_msgid_section = 0;
148+
current_msgid_block = "";
101149
next;
102150
}
103-
104-
# End of file: process the last entry if it exists
105151
END {
106-
if (in_entry == 2 && current_msgid != "" && current_msgstr == "" && is_fuzzy == 0) {
107-
print current_msgid "|||" current_msgstr;
152+
# Handle case where last entry is a translated msgid
153+
if (in_msgid_section && current_msgid_block != "") {
154+
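# Intentionally empty: an entry is only printed when its msgstr is non-empty,
155+
# and that happens in the /^msgstr/ rule, which relies on grep -A 1 supplying the line after each msgid.
156+
# NOTE(review): for a multi-line msgid the line after "msgid" is a continuation, not the msgstr, so such entries may never be printed; confirm.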
108157
}
109158
}
110-
EOF_AWK
111-
echo "Untranslated entries written to: $tmpfile"
159+
' | awk -f "$AWK_EXTRACT_MSGID_SCRIPT_PATH" - | sort -u > "$tmpfile_po_translated_msgids"
160+
echo "Step 2 Complete: Translated msgids from $po extracted to $tmpfile_po_translated_msgids"
161+
162+
# 3. Compare the two lists to find untranslated/missing entries
163+
echo "Step 3: Comparing msgids to find untranslated and missing entries..."
164+
comm -23 "$tmpfile_pot_msgids" "$tmpfile_po_translated_msgids" > "$tmpfile"
165+
echo "Step 3 Complete: Untranslated and missing entries written to: $tmpfile"
166+
167+
# Clean up temporary files for the current language
168+
echo "Cleaning up temporary files for language $lang..."
169+
rm "$tmpfile_pot_msgids" "$tmpfile_po_translated_msgids"
112170
done
171+
172+
# Clean up the AWK script file after all languages are processed
173+
echo "Cleaning up AWK script file..."
174+
rm "$AWK_EXTRACT_MSGID_SCRIPT_PATH"

0 commit comments

Comments
 (0)