blob: cc2f1c5dbfadff6bf11c8f0105006139d7bcb365 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|
#!/bin/sh
# Where the emoji list comes from.
URL="https://unicode.org/Public/emoji/latest/emoji-test.txt"

# All working files live in one directory below the XDG data home
# (falling back to ~/.local/share when XDG_DATA_HOME is unset or empty).
char_dir="${XDG_DATA_HOME:-${HOME}/.local/share}/thesiah/char"
INPUT_FILE="${char_dir}/emoji_raw"   # raw download
TEMP_FILE="${char_dir}/emoji_temp"   # output of the first filtering stage
OUTPUT_FILE="${char_dir}/emoji"      # final "emoji description" list

# Make sure that directory exists before anything is written into it.
mkdir -p "$char_dir"
# Download the emoji file.
# --fail makes curl exit non-zero on HTTP error responses (4xx/5xx); without
# it an error page would be stored in $INPUT_FILE and treated as emoji data.
echo "Downloading emoji-test.txt from Unicode..."
if curl --fail -o "$INPUT_FILE" -L "$URL"; then
  echo "Download complete! File saved to: $INPUT_FILE"
else
  # Diagnostics belong on stderr; do not leave a partial download behind.
  echo "Failed to download emoji" >&2
  rm -f -- "$INPUT_FILE"
  exit 1
fi
# First stage: copy only the wanted data lines from the download to $TEMP_FILE.
awk '
  # Blank lines and comment lines carry no emoji entry.
  /^[[:space:]]*$/ { next }
  /^#/ { next }

  # A line is printed only when all of these hold:
  #   - it mentions the status "fully-qualified" or "component",
  #   - it does not contain 200D (zero-width joiner),
  #   - its second field is not one of 1F3FB..1F3FF
  #     (the Unicode emoji skin-tone modifier code points).
  /(fully-qualified|component)/ && !/200D/ && $2 !~ /1F3F[BCDEF]/
' "$INPUT_FILE" >"$TEMP_FILE"
# Second stage: extract emoji and description.
#
# extract_emoji_descriptions FILE
#   Reads lines of the form
#     <code points> ; <status> # <emoji> <version> <description...>
#   from FILE and writes "<emoji> <description>" to stdout, one per line.
#   Lines without a "#" are skipped.
extract_emoji_descriptions() {
  awk '
  {
    # Everything after the FIRST "#" is the emoji/description part. Splitting
    # the whole line on "#" would break entries whose emoji or description
    # contains "#" itself (the keycap entry "keycap: #").
    hash_pos = index($0, "#")
    if (hash_pos == 0) {
      next
    }
    full_data = substr($0, hash_pos + 1)
    gsub(/^[[:space:]]+|[[:space:]]+$/, "", full_data)

    # split() returns the number of pieces; this avoids length(array),
    # which POSIX awk does not define.
    count = split(full_data, parts, " ")
    emoji = parts[1]

    # parts[2] is the version tag and is skipped; the description is
    # parts[3] onward, joined by single spaces.
    description = ""
    sep = ""
    for (i = 3; i <= count; i++) {
      description = description sep parts[i]
      sep = " "
    }

    print emoji, description
  }
  ' "$1"
}
extract_emoji_descriptions "$TEMP_FILE" >"$OUTPUT_FILE"
# Remove the intermediate files. They are regular files, so no recursive
# delete is needed; "--" keeps an unusual path from being read as an option.
rm -f -- "$INPUT_FILE" "$TEMP_FILE"
# Only claim success when the result file exists and is non-empty.
if [ -s "$OUTPUT_FILE" ]; then
  echo "Processing complete! File saved to: $OUTPUT_FILE"
else
  echo "Processing failed: $OUTPUT_FILE is missing or empty" >&2
  exit 1
fi
|