ar/.config/ncmpcpp/genius-lyrics.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175

#!/usr/bin/env python3
"""Fetch lyrics from Genius for the current (or given) song and write them
in the flat `<artist> - <title>.txt` form that ncmpcpp reads.

Why this exists: ncmpcpp 0.10.1's built-in Genius scraper stops at the first
</div> inside the lyrics container. Genius now nests a LyricsHeader div there,
so ncmpcpp only captures the header ("N Contributors<title> Lyrics") and drops
the actual lyrics. This script parses the current Genius structure correctly.

Usage:
    genius-lyrics.py                # uses `mpc current` for artist/title
    genius-lyrics.py "Artist" "Title"
    genius-lyrics.py --force ...    # refetch even if a good file already exists

Designed to be safe to call from ncmpcpp's execute_on_song_change: it never
raises to the caller, runs quietly, and skips songs that already have lyrics.
"""
from __future__ import annotations

import html
import json
import os
import re
import subprocess
import sys
import unicodedata
import urllib.parse
import urllib.request

# Directory holding the lyrics files. Taken from the NCMPCPP_LYRICS_DIR
# environment variable when it is set, otherwise ~/.local/share/lyrics;
# a leading "~" is expanded to the user's home directory.
LYRICS_DIR = os.path.expanduser(
    os.environ.get("NCMPCPP_LYRICS_DIR", "~/.local/share/lyrics")
)
# Browser-like User-Agent header that get() sends with every HTTP request.
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
# Timeout handed to urlopen() for each HTTP request (seconds).
TIMEOUT = 12

# Content that means a previous (broken) scrape, so we should refetch:
# e.g. "2 Contributorsgestalt Lyrics"
# DOTALL lets ".*" span newlines, so the pattern is applied to the whole
# content rather than to a single line.
BROKEN_RE = re.compile(r"^\s*\d+\s+Contributors?.*Lyrics\s*$", re.IGNORECASE | re.DOTALL)
# Minimum length, in characters, for content to count as real lyrics;
# anything shorter is treated as a failed scrape.
MIN_VALID_LEN = 60  # real lyrics are always longer than this


def get(url: str) -> str:
    """Download *url* and return the response body as text.

    The request carries the module-level User-Agent (UA) and is bounded by
    TIMEOUT. The body is decoded as UTF-8, with undecodable bytes replaced
    instead of raising. Errors raised by urllib propagate to the caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": UA})
    response = urllib.request.urlopen(request, timeout=TIMEOUT)
    with response:
        raw = response.read()
    return raw.decode("utf-8", "replace")


def current_song():
    """Ask ``mpc`` for the track that is currently playing.

    Returns an ``(artist, title)`` tuple, or ``None`` when mpc cannot be run
    (missing binary, timeout, any other error), prints nothing, or reports an
    empty artist or title.
    """
    command = ["mpc", "current", "-f", "%artist%\t%title%"]
    try:
        line = subprocess.run(
            command, capture_output=True, text=True, timeout=5
        ).stdout.strip()
    except Exception:
        return None

    # The format string separates the two tags with a tab; no tab (or an
    # empty side) means there is nothing usable to look up.
    artist, separator, title = line.partition("\t")
    if not (separator and artist and title):
        return None
    return artist, title


def lyrics_path(artist: str, title: str) -> str:
    """Return the lyrics file path for a song inside LYRICS_DIR.

    The file name has the flat ``<artist> - <title>.txt`` form; every '/' in
    it is replaced by '_' so the name cannot escape the directory.
    """
    slash_to_underscore = str.maketrans("/", "_")
    filename = f"{artist} - {title}.txt".translate(slash_to_underscore)
    return os.path.join(LYRICS_DIR, filename)


def needs_fetch(path: str, force: bool) -> bool:
    """Decide whether the lyrics file at *path* should be (re)downloaded.

    Returns True when *force* is set, when the file is missing or cannot be
    read, or when its content looks like a failed scrape: shorter than
    MIN_VALID_LEN after stripping, or matching BROKEN_RE. Returns False when
    a plausible lyrics file is already present.
    """
    if force or not os.path.exists(path):
        return True
    try:
        # errors="replace": a file containing non-UTF-8 bytes would otherwise
        # raise UnicodeDecodeError, which is not an OSError and would escape
        # this handler. Replaced characters do not affect the checks below.
        with open(path, encoding="utf-8", errors="replace") as f:
            content = f.read()
    except OSError:
        return True
    return len(content.strip()) < MIN_VALID_LEN or bool(BROKEN_RE.match(content))


def _norm(s: str) -> str:
    # Keep only alphanumerics (Unicode-aware: Korean stays), casefolded.
    return "".join(ch for ch in s.casefold() if ch.isalnum())


def _artist_matches(query_artist: str, hit_artist: str) -> bool:
    """Return True when the two artist names loosely refer to the same act.

    Both names are reduced with _norm(); they match when one reduced name
    contains the other. A name that reduces to nothing never matches.
    """
    wanted = _norm(query_artist)
    found = _norm(hit_artist)
    if wanted and found:
        return wanted in found or found in wanted
    return False


def genius_url(artist: str, title: str):
    """Look up the Genius page URL for a song.

    Queries Genius' multi-search endpoint with "<artist> <title>" and returns
    the URL of the first song hit whose primary artist matches *artist*
    (per _artist_matches). Returns None when the request or JSON decoding
    fails, or when no usable hit matches.
    """
    q = urllib.parse.quote(f"{artist} {title}")
    try:
        data = json.loads(get(f"https://genius.com/api/search/multi?q={q}"))
    except Exception:
        return None

    # JSON null decodes to None and any field may have an unexpected type;
    # coerce to empty containers so one odd entry is skipped instead of
    # raising AttributeError and aborting the whole search.
    def as_dict(value):
        return value if isinstance(value, dict) else {}

    def as_list(value):
        return value if isinstance(value, list) else []

    # Only accept a hit whose artist matches the song's artist. Genius search
    # happily returns unrelated songs when it lacks the track; writing those
    # would be worse than writing nothing.
    sections = as_list(as_dict(as_dict(data).get("response")).get("sections"))
    for section in sections:
        for hit in as_list(as_dict(section).get("hits")):
            hit = as_dict(hit)
            if hit.get("type") != "song":
                continue
            result = as_dict(hit.get("result"))
            hit_artist = as_dict(result.get("primary_artist")).get("name")
            url = result.get("url")
            # A hit without a usable name or URL cannot be used; keep looking.
            if not isinstance(hit_artist, str) or not isinstance(url, str) or not url:
                continue
            if _artist_matches(artist, hit_artist):
                return url
    return None


def extract_lyrics(page: str) -> str:
    """Extract plain-text lyrics from the HTML of a Genius song page.

    Collects the content of every ``data-lyrics-container`` element, removes
    the nested header block, turns ``<br>`` into newlines, strips all other
    tags and unescapes HTML entities. Returns the stripped text, which is
    empty when the page has no lyrics container.
    """
    # Every verse lives in a data-lyrics-container; collect them all.
    parts = re.findall(
        r'data-lyrics-container="true"[^>]*>(.*?)</div>'
        r'(?=\s*<div data-lyrics-container|\s*<div[^>]*class="[^"]*RightSidebar'
        r'|\s*</div>\s*<div[^>]*StyledLink|$)',
        page, re.S,
    )
    if not parts:
        # Fallback: take everything after the first container's opening tag.
        parts = re.findall(r'data-lyrics-container="true"[^>]*>(.*)', page, re.S)
    # Join with a newline: joining with "" would fuse the last line of one
    # container with the first line of the next onto a single line.
    text = "\n".join(parts)
    # Drop the nested header block (Contributors / "<song> Lyrics") that broke ncmpcpp.
    text = re.sub(
        r'<div[^>]*data-exclude-from-selection="true".*?</div>\s*</div>\s*</div>',
        "", text, flags=re.S,
    )
    text = re.sub(r"<br\s*/?>", "\n", text)        # line breaks
    text = re.sub(r"<[^>]+>", "", text)            # strip remaining tags
    return html.unescape(text).strip()


def write_atomic(path: str, content: str) -> None:
    """Write *content* plus a trailing newline to *path* atomically.

    The text is written as UTF-8 to a temporary file next to *path*, which is
    then renamed over the target with os.replace(), so a reader never sees a
    partially written file. Missing parent directories are created.

    Raises:
        OSError: if the directory or file cannot be created or written; the
            temporary file is removed before the error propagates.
    """
    directory = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises.
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Per-process temp name so overlapping runs for the same song do not
    # write into each other's temporary file.
    tmp = f"{path}.{os.getpid()}.tmp"
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            f.write(content + "\n")
        os.replace(tmp, path)
    except Exception:
        # Don't leave a stray temp file behind in the lyrics directory.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise


def main(argv) -> int:
    """Fetch and store lyrics for one song; always returns 0.

    *argv* is the command line without the program name. ``--force`` anywhere
    in it requests a refetch even when a good file exists. With at least two
    remaining arguments the first two are used as artist and title; otherwise
    the song is taken from current_song(). Nothing is written when no song is
    known, the existing file is fine, no Genius page is found, the download
    fails, or the extracted text looks like a failed scrape.
    """
    force = "--force" in argv
    positional = [arg for arg in argv if arg != "--force"]

    if len(positional) >= 2:
        artist, title = positional[:2]
    else:
        song = current_song()
        if not song:
            return 0  # nothing playing; nothing to do
        artist, title = song

    target = lyrics_path(artist, title)
    if not needs_fetch(target, force):
        return 0

    page_url = genius_url(artist, title)
    if not page_url:
        return 0

    try:
        page = get(page_url)
        lyrics = extract_lyrics(page)
    except Exception:
        return 0

    # Only replace the file with text that passes the same sanity checks
    # needs_fetch() applies; never overwrite with garbage.
    looks_valid = len(lyrics) >= MIN_VALID_LEN and not BROKEN_RE.match(lyrics)
    if looks_valid:
        write_atomic(target, lyrics)
    return 0


if __name__ == "__main__":
    # Script entry point. Any exception from main() is swallowed and turned
    # into exit status 0 so that a failure here can never disrupt ncmpcpp's
    # song-change command.
    status = 0
    try:
        status = main(sys.argv[1:])
    except Exception:
        status = 0
    sys.exit(status)