lyrics · executable file · 160 lines (140 loc) · 4.76 KB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# lyrics is a Python 3 script to retrieve lyrics from azlyrics.com
# The basic usage is
#
#     lyrics [<args>] keyword...
#
# For more information on the arguments, run
#
#     lyrics --help
#
# lyrics can be used to retrieve lyrics for the currently playing song
# if your audio player is capable of printing the details of the current
# song on the standard output.
#
# * Example with keyword:
#
#     lyrics rebecca black friday
#
# * Example with MPD:
#
#     lyrics $(mpc --format '%artist% %title%' | head -n 1)
#
# * Example with DeaDBeeF:
#
#     lyrics $(deadbeef --nowplaying '%a %t' 2>/dev/null)
#
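# * Example with the -c/--cat flag, which writes to stdout so the output
#   can be piped to other tools (grep here is just one possible consumer):
#
#     lyrics -c rebecca black friday | grep -i friday
#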
# Requires: Python 3 with BeautifulSoup >= 4.0 and Requests
#
import argparse
import os
import re
import requests
import subprocess
import sys
from bs4 import BeautifulSoup
from pydoc import pager
from unicodedata import normalize
# Use Google's "I'm Feeling Lucky" to "fuzzy" search for the right page.
SEARCH_URL = "https://www.google.com/search?hl=en&btnI&q=site:www.azlyrics.com"
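# get_lyrics() appends the sanitized keyword plus the word "lyrics", so a
# search for "rebecca black friday" requests
#     .../search?hl=en&btnI&q=site:www.azlyrics.com+rebecca+black+friday+lyrics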
REDIRECT_RE = r"href=['\"](https?://www.azlyrics.com/[^'\"]*)['\"]"
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible)", "Referer": "www.google.com"}
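# Downloaded lyrics are cached as plain-text files so that repeat lookups
# (and -e/--edit) do not hit the network again.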
CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache/lyrics")
if not os.path.exists(CACHE_DIR):
    os.makedirs(CACHE_DIR)
EDITOR = os.environ.get("EDITOR", "vi")


class PrintError(Exception):
    pass


def sanitize(string):
    """Normalize a keyword into an ASCII, '+'-separated query string.

    e.g. 'Rebecca Black "Friday" (Official)' -> 'rebecca+black+friday'
    """
    # Transliterate to ASCII and lowercase.
    string = normalize("NFKD", string).encode("ASCII", "ignore")
    string = string.decode("ASCII")
    string = string.lower()
    # Drop quotes and any bracketed annotations such as "(remix)".
    string = re.sub(r"['\"]+", "", string)
    string = re.sub(r"[([{][^\]})]*[\]})]", "", string)
    # Trim leading/trailing punctuation, then join the words with '+'.
    string = re.sub(r"^\W+", "", string)
    string = re.sub(r"\W+$", "", string)
    string = re.sub(r"\W+", "+", string, flags=re.ASCII)
    return string


def get_lyrics(keyword):
    """Fetch the lyrics page and extract artist, title and lyrics with BeautifulSoup."""
    keyword = sanitize(keyword)
    url = "+".join([SEARCH_URL, keyword, "lyrics"])
    req = requests.get(url, headers=HEADERS)
    # Google seems to be using JavaScript for redirection these days,
    # so follow the azlyrics.com link embedded in the interstitial page.
    if re.search(r"unauthorizedredirect", req.text):
        new_url = re.search(REDIRECT_RE, req.text).group(1)
        req = requests.get(new_url, headers=HEADERS)
    # requests seems to have trouble realizing that azlyrics.com
    # uses UTF-8 encoding, so specify it explicitly.
    req.encoding = "utf-8"
    # If lxml is available, prefer it over the stdlib parser.
    if "lxml" in sys.modules:
        soup = BeautifulSoup(req.text, features="lxml")
    else:
        soup = BeautifulSoup(req.text, features="html.parser")
    div = soup.find("div", attrs={"class": "col-xs-12 col-lg-8 text-center"})
    try:
        # The artist heading reads "<Artist> Lyrics"; the title is the bare,
        # quoted <b> child of the main column.
        artist = div.find("div", attrs={"class": "lyricsh"}).find("h2").text
        artist = re.sub(r"(^\s*|\s*Lyrics\s*$)", "", artist)
        title = div.find("b", recursive=False).text
        title = re.sub(r'(^[\s"]*|[\s"]*$)', "", title)
    except AttributeError:
        raise PrintError("Error retrieving lyrics for the given keyword.\n" + req.url)
    # The lyrics live in <div> elements with neither class nor id attributes.
    lyrics = div.find_all("div", attrs={"class": None, "id": None})
    if not lyrics:
        raise PrintError("Error extracting lyrics from HTML.\n" + req.url)
    lyrics = "".join([block.get_text().strip() for block in lyrics])
    return artist, title, lyrics, req.url


def main():
    """Parse arguments, fetch or load cached lyrics, and display them."""
    arg_parser = argparse.ArgumentParser(
        prog="lyrics",
        description="retrieve lyrics from azlyrics.com using keywords",
    )
    arg_parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        help="force download lyrics (overwrites cache)",
    )
    group = arg_parser.add_mutually_exclusive_group()
    group.add_argument(
        "-e",
        "--edit",
        action="store_true",
        help="edit cached lyrics file using $EDITOR",
    )
    group.add_argument(
        "-c",
        "--cat",
        action="store_true",
        help="write to stdout instead of using pager",
    )
    arg_parser.add_argument("keyword", help="search keywords", nargs="+")
    args = arg_parser.parse_args()

    keyword = " ".join(args.keyword)
    cache_file = os.path.join(CACHE_DIR, sanitize(keyword) + ".txt")
    if args.edit:
        subprocess.call([EDITOR, cache_file])
        return
    # Reuse the cached copy unless --force asks for a fresh download.
    if os.path.exists(cache_file) and not args.force:
        with open(cache_file, "r") as fd:
            text = fd.read()
    else:
        try:
            data = get_lyrics(keyword)
            text = "{} - {}\n---\n\n{}\n\n{}".format(*data)
            with open(cache_file, "w") as fd:
                fd.write(text)
        except PrintError as e:
            # Report the failure and exit non-zero so callers can detect it.
            print(e, file=sys.stderr)
            sys.exit(1)
    if args.cat:
        print(text)
    else:
        pager(text)


if __name__ == "__main__":
    sys.exit(main())