diff options
Diffstat (limited to 'ar/.config/ncmpcpp/genius-lyrics.py')
| -rwxr-xr-x | ar/.config/ncmpcpp/genius-lyrics.py | 175 |
1 files changed, 175 insertions, 0 deletions
diff --git a/ar/.config/ncmpcpp/genius-lyrics.py b/ar/.config/ncmpcpp/genius-lyrics.py new file mode 100755 index 0000000..98a0292 --- /dev/null +++ b/ar/.config/ncmpcpp/genius-lyrics.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +"""Fetch lyrics from Genius for the current (or given) song and write them +in the flat `<artist> - <title>.txt` form that ncmpcpp reads. + +Why this exists: ncmpcpp 0.10.1's built-in Genius scraper stops at the first +</div> inside the lyrics container. Genius now nests a LyricsHeader div there, +so ncmpcpp only captures the header ("N Contributors<title> Lyrics") and drops +the actual lyrics. This script parses the current Genius structure correctly. + +Usage: + genius-lyrics.py # uses `mpc current` for artist/title + genius-lyrics.py "Artist" "Title" + genius-lyrics.py --force ... # refetch even if a good file already exists + +Designed to be safe to call from ncmpcpp's execute_on_song_change: it never +raises to the caller, runs quietly, and skips songs that already have lyrics. +""" +from __future__ import annotations + +import html +import json +import os +import re +import subprocess +import sys +import urllib.parse +import urllib.request + +LYRICS_DIR = os.path.expanduser( + os.environ.get("NCMPCPP_LYRICS_DIR", "~/.local/share/lyrics") +) +UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36" +TIMEOUT = 12 + +# Content that means a previous (broken) scrape, so we should refetch: +# e.g. "2 Contributorsgestalt Lyrics" +BROKEN_RE = re.compile(r"^\s*\d+\s+Contributors?.*Lyrics\s*$", re.IGNORECASE | re.DOTALL) +MIN_VALID_LEN = 60 # real lyrics are always longer than this + + +def get(url: str) -> str: + req = urllib.request.Request(url, headers={"User-Agent": UA}) + with urllib.request.urlopen(req, timeout=TIMEOUT) as r: + return r.read().decode("utf-8", "replace") + + +def current_song(): + try: + out = subprocess.run( + ["mpc", "current", "-f", "%artist%\t%title%"], + capture_output=True, text=True, timeout=5, + ).stdout.strip() + except Exception: + return None + if not out or "\t" not in out: + return None + artist, title = out.split("\t", 1) + if not artist or not title: + return None + return artist, title + + +def lyrics_path(artist: str, title: str) -> str: + # ncmpcpp stores lyrics flat as "<artist> - <title>.txt"; '/' is unsafe. + name = f"{artist} - {title}.txt".replace("/", "_") + return os.path.join(LYRICS_DIR, name) + + +def needs_fetch(path: str, force: bool) -> bool: + if force or not os.path.exists(path): + return True + try: + with open(path, encoding="utf-8") as f: + content = f.read() + except OSError: + return True + return len(content.strip()) < MIN_VALID_LEN or bool(BROKEN_RE.match(content)) + + +def _norm(s: str) -> str: + # Keep only alphanumerics (Unicode-aware: Korean stays), casefolded. + return "".join(ch for ch in s.casefold() if ch.isalnum()) + + +def _artist_matches(query_artist: str, hit_artist: str) -> bool: + a, b = _norm(query_artist), _norm(hit_artist) + if not a or not b: + return False + return a in b or b in a + + +def genius_url(artist: str, title: str): + q = urllib.parse.quote(f"{artist} {title}") + try: + data = json.loads(get(f"https://genius.com/api/search/multi?q={q}")) + except Exception: + return None + # Only accept a hit whose artist matches the song's artist. Genius search + # happily returns unrelated songs when it lacks the track; writing those + # would be worse than writing nothing. + for section in data.get("response", {}).get("sections", []): + for hit in section.get("hits", []): + if hit.get("type") != "song": + continue + result = hit.get("result", {}) + hit_artist = result.get("primary_artist", {}).get("name", "") + if _artist_matches(artist, hit_artist): + return result.get("url") + return None + + +def extract_lyrics(page: str) -> str: + # Every verse lives in a data-lyrics-container; concatenate them all. + parts = re.findall( + r'data-lyrics-container="true"[^>]*>(.*?)</div>' + r'(?=\s*<div data-lyrics-container|\s*<div[^>]*class="[^"]*RightSidebar' + r'|\s*</div>\s*<div[^>]*StyledLink|$)', + page, re.S, + ) + if not parts: + parts = re.findall(r'data-lyrics-container="true"[^>]*>(.*)', page, re.S) + text = "".join(parts) + # Drop the nested header block (Contributors / "<song> Lyrics") that broke ncmpcpp. + text = re.sub( + r'<div[^>]*data-exclude-from-selection="true".*?</div>\s*</div>\s*</div>', + "", text, flags=re.S, + ) + text = re.sub(r"<br\s*/?>", "\n", text) # line breaks + text = re.sub(r"<[^>]+>", "", text) # strip remaining tags + return html.unescape(text).strip() + + +def write_atomic(path: str, content: str) -> None: + os.makedirs(os.path.dirname(path), exist_ok=True) + tmp = path + ".tmp" + with open(tmp, "w", encoding="utf-8") as f: + f.write(content + "\n") + os.replace(tmp, path) + + +def main(argv) -> int: + force = "--force" in argv + argv = [a for a in argv if a != "--force"] + + if len(argv) >= 2: + artist, title = argv[0], argv[1] + else: + song = current_song() + if not song: + return 0 # nothing playing; nothing to do + artist, title = song + + path = lyrics_path(artist, title) + if not needs_fetch(path, force): + return 0 + + url = genius_url(artist, title) + if not url: + return 0 + try: + lyrics = extract_lyrics(get(url)) + except Exception: + return 0 + if len(lyrics) < MIN_VALID_LEN or BROKEN_RE.match(lyrics): + return 0 # don't overwrite with garbage + + write_atomic(path, lyrics) + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main(sys.argv[1:])) + except Exception: + sys.exit(0) # never disrupt ncmpcpp's song-change command |
