-
Notifications
You must be signed in to change notification settings - Fork 2k
lastgenre: Genre spelling normalization (aliases) #6466
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
JOJ0
wants to merge
22
commits into
master
Choose a base branch
from
lastgenre_aliases
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
22 commits
Select commit
Hold shift + click to select a range
ca16d85
lastgenre: Test aliases and ship default config
JOJ0 9ed89de
lastgenre: Docs for genre normalization (aliases)
JOJ0 7b2d4b0
lastgenre: Docs hint on canonical without whitelist
JOJ0 b1cfe11
lastgenre: Implement genre alias normalization
JOJ0 5630b17
lastgenre: Slightly optimize filter_valid performance
JOJ0 04fe3e9
Fixes/additions to default aliases
JOJ0 2efb8bc
Fix normalize_genre docstring wording
JOJ0 56251ec
Test invalid alias template catch
JOJ0 95fdc90
Catch exception on invalid alias template
JOJ0 0e979d4
fix test aliases
JOJ0 d60ec41
lastgenre: Dedup ignore/alias regex compilation
JOJ0 a65d0b3
Streamline in test alias type with ignore naming
JOJ0 f56dc28
Streamline alias type with ignore naming
JOJ0 271a23f
lastgenre: Fix ignorelist tests naming inconsistencies
JOJ0 1f48c92
Add alt rock to default aliases test
JOJ0 942221a
Finalize default aliases and fix genres,genres-tree
JOJ0 f813e95
Restructure and reduce alias tests
JOJ0 9359a7d
Reword comment on config bool/mapping handling
JOJ0 0e27872
Fixes to default aliases.yaml
JOJ0 dfa54b5
Hyphenate post rock in default aliases
JOJ0 1123400
Remove redundant hyphens for some default aliases
JOJ0 0eaca9d
Remove redundant slashes for some default aliases
JOJ0 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,7 +25,6 @@ | |
| from __future__ import annotations | ||
|
|
||
| import os | ||
| import re | ||
| from collections import defaultdict | ||
| from functools import singledispatchmethod | ||
| from pathlib import Path | ||
|
|
@@ -37,7 +36,11 @@ | |
| from beets import config, library, plugins, ui | ||
| from beets.library import Album, Item | ||
| from beets.util import plurality, unique_list | ||
| from beetsplug.lastgenre.utils import drop_ignored_genres, is_ignored | ||
| from beetsplug.lastgenre.utils import ( | ||
| compile_pattern, | ||
| is_ignored, | ||
| normalize_genre, | ||
| ) | ||
|
|
||
| from .client import LastFmClient | ||
|
|
||
|
|
@@ -48,7 +51,7 @@ | |
| from beets.importer import ImportSession, ImportTask | ||
| from beets.library import LibModel | ||
|
|
||
| from .utils import GenreIgnorePatterns | ||
| from .utils import GenreAliasPatterns, GenreIgnorePatterns | ||
|
|
||
| Whitelist = set[str] | ||
| """Set of valid genre names (lowercase). Empty set means all genres allowed.""" | ||
|
|
@@ -115,6 +118,7 @@ def sort_by_depth(tags: list[str], branches: CanonTree) -> list[str]: | |
|
|
||
| WHITELIST = os.path.join(os.path.dirname(__file__), "genres.txt") | ||
| C14N_TREE = os.path.join(os.path.dirname(__file__), "genres-tree.yaml") | ||
| ALIASES = os.path.join(os.path.dirname(__file__), "aliases.yaml") | ||
|
|
||
|
|
||
| class LastGenrePlugin(plugins.BeetsPlugin): | ||
|
|
@@ -137,6 +141,7 @@ def __init__(self) -> None: | |
| "title_case": True, | ||
| "pretend": False, | ||
| "ignorelist": {}, | ||
| "aliases": True, | ||
| } | ||
| ) | ||
| self.setup() | ||
|
|
@@ -150,8 +155,12 @@ def setup(self) -> None: | |
| self.c14n_branches: CanonTree | ||
| self.c14n_branches, self.canonicalize = self._load_c14n_tree() | ||
| self.ignore_patterns: GenreIgnorePatterns = self._load_ignorelist() | ||
| self.alias_patterns: GenreAliasPatterns = self._load_aliases() | ||
| self.client = LastFmClient( | ||
| self._log, self.config["min_weight"].get(int), self.ignore_patterns | ||
| self._log, | ||
| self.config["min_weight"].get(int), | ||
| self.ignore_patterns, | ||
| self.alias_patterns, | ||
| ) | ||
|
|
||
| def _load_whitelist(self) -> Whitelist: | ||
|
|
@@ -228,24 +237,68 @@ def _load_ignorelist(self) -> GenreIgnorePatterns: | |
|
|
||
| compiled_ignorelist: GenreIgnorePatterns = defaultdict(list) | ||
| for artist, patterns in raw_ignorelist.items(): | ||
| artist_patterns = [] | ||
| for pattern in patterns: | ||
| try: | ||
| artist_patterns.append(re.compile(pattern, re.IGNORECASE)) | ||
| except re.error: | ||
| artist_patterns.append( | ||
| re.compile(re.escape(pattern), re.IGNORECASE) | ||
| ) | ||
| artist_patterns = [compile_pattern(p) for p in patterns] | ||
| self._log.extra_debug( | ||
| "ignore for {}: {}", | ||
| artist, | ||
| [p.pattern for p in artist_patterns], | ||
| ) | ||
|
|
||
| compiled_ignorelist[artist] = artist_patterns | ||
| compiled_ignorelist[artist.lower()] = artist_patterns | ||
|
|
||
| return compiled_ignorelist | ||
|
|
||
| def _load_aliases(self) -> GenreAliasPatterns: | ||
| """Load the genre alias table from the beets config. | ||
|
|
||
| Reads ``lastgenre.aliases`` as a mapping of genre names to lists of | ||
| regex patterns:: | ||
|
|
||
| lastgenre: | ||
| aliases: | ||
| drum and bass: | ||
| - d(rum)?[ &n/]*b(ass)? | ||
| \\g<1> hop: | ||
| - (glitch|hip|jazz|trip)y?[ /-]*hop | ||
|
|
||
| The key (genre name) is used as a ``re.Match.expand()`` template, | ||
| so ``\\g<N>`` back-references to capture groups are supported. | ||
|
|
||
| Setting ``aliases: true`` (the default) loads the bundled | ||
| ``aliases.yaml`` file. Setting ``aliases: false`` disables | ||
| normalization entirely. | ||
|
|
||
| Raises: | ||
| confuse.ConfigTypeError: when the config value is not a mapping | ||
| or a list entry is not a string. | ||
| """ | ||
| aliases_raw = self.config["aliases"].get() | ||
| if aliases_raw is False: | ||
| return [] | ||
| if aliases_raw in (True, "", None): | ||
| self._log.debug("Loading default aliases from {}", ALIASES) | ||
| with Path(ALIASES).open(encoding="utf-8") as f: | ||
| aliases_dict = yaml.safe_load(f) | ||
| if not aliases_dict: | ||
| return [] | ||
| else: | ||
| # aliases defaults to True (unlike ignorelist), so MappingValues | ||
| # would raise on the boolean default layer. | ||
| aliases_cfg = confuse.Configuration("lastgenre_aliases", read=False) | ||
| aliases_cfg.set({"aliases": aliases_raw}) | ||
| aliases_dict = aliases_cfg["aliases"].get( | ||
| confuse.MappingValues(confuse.Sequence(str)) | ||
| ) | ||
|
|
||
| entries: GenreAliasPatterns = [] | ||
| for canonical, patterns in aliases_dict.items(): | ||
| template = str(canonical).lower() | ||
| for raw_pat in patterns: | ||
| entries.append((compile_pattern(str(raw_pat)), template)) | ||
|
|
||
| self._log.extra_debug("Loaded {} alias entries", len(entries)) | ||
| return entries | ||
|
|
||
| @property | ||
| def sources(self) -> tuple[str, ...]: | ||
| """A tuple of allowed genre sources. May contain 'track', | ||
|
|
@@ -267,6 +320,8 @@ def _resolve_genres( | |
| """Canonicalize, sort and filter a list of genres. | ||
|
|
||
| - Returns an empty list if the input tags list is empty. | ||
| - If aliases are configured, variant spellings are normalised first | ||
| (e.g. 'hip-hop' → 'hip hop', 'dnb' → 'drum and bass'). | ||
| - If canonicalization is enabled, it extends the list by incorporating | ||
| parent genres from the canonicalization tree. When a whitelist is set, | ||
| only parent tags that pass the whitelist filter are included; | ||
|
|
@@ -286,6 +341,13 @@ def _resolve_genres( | |
| if not tags: | ||
| return [] | ||
|
|
||
| # Normalize variant spellings before any other processing. | ||
| if self.alias_patterns: | ||
| tags = [ | ||
| normalize_genre(self._log, self.alias_patterns, tag) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are we normalizing genres twice: once in the client and once here? |
||
| for tag in tags | ||
| ] | ||
|
|
||
| count = self.config["count"].get(int) | ||
|
|
||
| # Canonicalization (if enabled) | ||
|
|
@@ -353,14 +415,19 @@ def _filter_valid( | |
| if not self.whitelist and not self.ignore_patterns: | ||
| return cleaned | ||
|
|
||
| whitelisted = [ | ||
| g | ||
| for g in cleaned | ||
| if not self.whitelist or g.lower() in self.whitelist | ||
| ] | ||
| return drop_ignored_genres( | ||
| self._log, self.ignore_patterns, whitelisted, artist | ||
| ) | ||
| result = [] | ||
| for genre in cleaned: | ||
| if self.whitelist and genre.lower() not in self.whitelist: | ||
| continue | ||
|
|
||
| if self.ignore_patterns and is_ignored( | ||
| self._log, self.ignore_patterns, genre, artist | ||
| ): | ||
| continue | ||
|
|
||
| result.append(genre) | ||
|
|
||
| return result | ||
|
|
||
| # Genre resolution pipeline. | ||
|
|
||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,136 @@ | ||
| # Default genre aliases for the lastgenre plugin. | ||
| # | ||
| # Keys are canonical names and support \g<N> back-references to pattern groups. | ||
| # Patterns are case-insensitive full-matches. Order matters: first match wins. | ||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Ampersands / Delimiters | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
| # drum and bass (d&b, dnb, drum n bass, ...) | ||
| drum and bass: | ||
| - d(rum)?[ &n/]*b(ass)? | ||
|
|
||
| # rhythm and blues (r&b, rnb, rhythm/blues, ...) | ||
| rhythm and blues: | ||
| - r(hythm)?[ &n/]*b(lues)? | ||
|
|
||
| # rock and roll (rock & roll, rock'n'roll, rock-n-roll, ...) | ||
| rock and roll: | ||
| - rock[ '‐&n/ \-]*roll | ||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Consistent Delimiters (Hyphenation) | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
|
|
||
| # Hyphenate j-pop, k-pop, c-pop, etc. | ||
| # Matches: kpop, k pop -> k-pop; j rock -> j-rock; | ||
| # | ||
| \g<1>-\g<2>: | ||
| - (c|k|j) *(folk|goth|pop|rock|ska|trance) | ||
|
|
||
| # post-rock, post-punk, post-metal, etc. (post rock -> post-rock) | ||
| post-\g<1>: | ||
| - post +(\w+) | ||
|
|
||
| # lo-fi, glo-fi (lofi, lo fi -> lo-fi) | ||
| lo-fi: | ||
| - (g?lo) *fi | ||
|
|
||
| # p-funk, g-funk, etc. (p funk -> p-funk) | ||
| \g<1>-funk: | ||
| - (p|g) *funk | ||
|
|
||
| # synthpop, synthwave, etc. (synth pop -> synthpop) | ||
| synth\g<1>: | ||
| - synth[ -]+(\w+) | ||
|
|
||
| # avant-garde (avantgarde, avant gard, avant-gard) | ||
| avant-garde: | ||
| - avant *(gard(e)?)? | ||
| - avant-gard | ||
| - avant | ||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Nu- Genre Spelling (nu jazz, nu-disco, etc.) | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
| # Matches: nu-jazz -> nu jazz; nu-disco -> nu disco | ||
| # Note: 'nu-disco' is hyphenated in the tree but 'nu jazz' isn't in genres.txt | ||
| nu \g<1>: | ||
| - nu[ -]*(disco|jazz|metal|soul) | ||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Terminology / Synonym / Translation fixes | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
| # electronic (electronic music; 'elektronika' is not matched by the pattern below) | ||
| electronic: | ||
| - electronic music | ||
|
|
||
| # world music (world) | ||
| world music: | ||
| - world | ||
|
|
||
| # chillout (chill, chill out, chill-out) | ||
| chillout: | ||
| - chill([ -]*out)? | ||
|
|
||
| # darkwave (dark wave) | ||
| darkwave: | ||
| - dark[ -]*wave | ||
|
|
||
| # downtempo (downbeat) | ||
| downtempo: | ||
| - down[ -]*beat | ||
|
|
||
| # shoegaze (shoegazer, shoegazing) | ||
| shoegaze: | ||
| - shoegaze?r? | ||
| - shoegazing | ||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Form Fixes (Hip Hop, Trip Hop, etc.) | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
| # Normalized spacing: hip-hop, hiphop -> hip hop | ||
| \g<1> hop: | ||
| - (glitch|hip|jazz|trip)y?([ -]*hip)?[ -]*hop | ||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Abbreviations & International Spellings | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
| # blues rock (blues-rock) | ||
| blues rock: | ||
| - blues[ -]*rock | ||
|
|
||
| # folk rock (folk-rock) | ||
| folk rock: | ||
| - folk[ -]*rock | ||
|
|
||
| # alternative rock (alt, alternative, alt rock, alternative rock, ...) | ||
| alternative rock: | ||
| - alt([ -]*rock)? | ||
| - alternative([ -]*rock)? | ||
|
|
||
| # indie rock (indie, indie rock) | ||
| indie rock: | ||
| - indie([ -]*rock)? | ||
|
|
||
| # gothic rock (goth, goth rock) - doesn't catch gothic metal | ||
| gothic rock: | ||
| - goth(?!ic)([ -]*rock)? | ||
| - gothic[ -]*rock | ||
|
|
||
| # progressive rock (prog, prog rock, progressive rock) | ||
| # Note: mapping standalone 'progressive' is avoided to prevent catching 'progressive metal', etc. | ||
| progressive rock: | ||
| - prog([ -]*rock)? | ||
| - progressive[ -]*rock | ||
|
|
||
| # traditional folk (trad, traditional) | ||
| # Note: avoids matching 'trad jazz' or 'traditional country'. | ||
| traditional folk: | ||
| - trad(/|ition(/|al)?)?-? |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not set these aliases in the default plugin configuration?