stuff

2024-12-22 06:22:41 +00:00 · 2023-11-15 22:45:51 +01:00 · 2023-11-15 22:45:51 +01:00 · f35f1e1e6c
commit f35f1e1e6c
parent a76abef429
5 changed files with 809 additions and 605 deletions
--- a/README.md
+++ b/README.md
@ -2,32 +2,32 @@

 A batch downloader for Soulseek using Soulseek.NET. Accepts CSV files and Spotify or YouTube urls.

-##### Download tracks from a csv file:
+#### Download tracks from a csv file:
 ```
 slsk-batchdl -i test.csv
 ```  
 Use `--print tracks` before downloading to check if everything has been parsed correctly. The names of the columns should be: `Artist`, `Title`, `Album`, `Length`. Only the title column is required, but any additional info improves search.

-##### Download spotify likes while skipping existing songs and creating an m3u file:
+#### Download spotify likes while skipping existing songs:
 ```
-slsk-batchdl -i spotify-likes --m3u --skip-existing
+slsk-batchdl -i spotify-likes --skip-existing
 ```
-You might need to provide an id and secret when using spotify (e.g when downloading a private playlist), which you can get here https://developer.spotify.com/dashboard/applications. Create an app, then select it and add `http://localhost:48721/callback` as a redirect url in the settings.  
+To download private playlists or liked songs you will need to provide a client id and secret, which you can get here https://developer.spotify.com/dashboard/applications. Create an app and add `http://localhost:48721/callback` as a redirect url in its settings.  
  
-##### Download the first 10 songs of a youtube playlist:
+#### Download a youtube playlist (with fallback to yt-dlp), including deleted videos:
 ```
-slsk-batchdl -n 10 -i "https://www.youtube.com/playlist?list=PLI_eFW8NAFzYAXZ5DrU6E6mQ_XfhaLBUX"
+slsk-batchdl --get-deleted --yt-dlp -i "https://www.youtube.com/playlist?list=PLI_eFW8NAFzYAXZ5DrU6E6mQ_XfhaLBUX"
 ```
-To include unavailable videos, you will need to provide an api key with `--youtube-key`. Get it here https://console.cloud.google.com. Create a new project, click "Enable Api" and search for "youtube data", then follow the prompts.  
+Playlists are retrieved using the YoutubeExplode library, which unfortunately doesn't always return all videos. You can use the official API instead by providing a key with `--youtube-key`. Get it here: https://console.cloud.google.com. Create a new project, click "Enable Api" and search for "youtube data", then follow the prompts.  

-##### Search & download a specific song, preferring high quality:
+#### Search & download a specific song:
 ```
 slsk-batchdl -i "title=MC MENTAL @ HIS BEST,length=242" --pref-format "flac,wav"
 ```  
  
-##### Find an artist's songs which aren't in your library:
+#### Find an artist's songs which aren't in your library:
 ```
-slsk-batchdl -i "artist=MC MENTAL" -a --print tracks --skip-existing --music-dir "path\to\music"
+slsk-batchdl -i "artist=MC MENTAL" --aggregate --print tracks --skip-existing --music-dir "path\to\music"
 ```

 ### Options:
@ -48,47 +48,43 @@ Usage: slsk-batchdl -i <input> [OPTIONS]
                                 Title, Album, Length. Only the title column is required, but
                                 any extra info improves search results.

-                                 String for the track, album, or artist to search for:
-                                 Can either be any typical search text like "Artist - Title"
-                                 or a comma-separated list like "title=Song,artist=Artist"
-                                 Available fields: title, artist, album, length (in seconds).
+                                 Name of the track, album, or artist to search for:
+                                 Can either be any typical search string or a comma-separated
+                                 list like "title=Song Name,artist=Artist Name,length=215"
+                                 Allowed properties are: title, artist, album, length (sec)

 Options:
  --user <username>              Soulseek username
  --pass <password>              Soulseek password

-  --spotify                      Input is a spotify url (override automatic parsing)
-  --spotify-id <id>              spotify client ID (required for private playlists)
-  --spotify-secret <secret>      spotify client secret (required for private playlists)
+  --spotify-id <id>              spotify client ID
+  --spotify-secret <secret>      spotify client secret

-  --youtube                      Input is a youtube url (override automatic parsing)
  --youtube-key <key>            Youtube data API key
+  --get-deleted                  Attempt to retrieve titles of deleted videos from the Wayback
+                                 Machine. Requires yt-dlp.

-  --csv                          Input is a path to a local CSV (override automatic parsing)
  --time-format <format>         Time format in Length column of the csv file (e.g h:m:s.ms
-                                 for durations like 1:04:35.123). Default: s (seconds)
+                                 for durations like 1:04:35.123). Default: s
  --yt-parse                     Enable if the csv file contains YouTube video titles and
-                                 channel names; attempt to parse them into proper title and
-                                 artist. If the the csv contains an "ID", "URL", or
-                                 "Description" column then those will be used for parsing as
-                                 well.
+                                 channel names; attempt to parse them into title and artist
+                                 names.

-  --string                       Input is a search string (override automatic parsing)
-  -a --aggregate                 Instead of downloading a single track matching the search
-                                 string, find and download all distinct songs associated with
-                                 the provided artist, album, or track title. Search string must
-                                 be a list of properties.
- --min-users-aggregate <num>     Minimum number of users sharing a track before it is
-                                 downloaded in aggregate mode. Setting it to 2 or more will
-                                 significantly reduce false positives, but may introduce false
-                                 negatives. Default: 1
+  -a --aggregate                 When input is a string: Instead of downloading a single
+                                 track matching the search string, find and download all
+                                 distinct songs associated with the provided artist, album,
+                                 or track title. Input string must be a list of properties.
+  --min-users-aggregate <num>    Minimum number of users sharing a track before it is
+                                 downloaded in aggregate mode. Setting it to higher values
+                                 will significantly reduce false positives, but may introduce
+                                 false negatives. Default: 2

-  -p --path <path>               Where to place downloaded files
-  -f --folder <name>             Subfolder name
+  -p --path <path>               Download folder
+  -f --folder <name>             Subfolder name (default: playlist/csv name)
  -n --number <maxtracks>        Download the first n tracks of a playlist
  -o --offset <offset>           Skip a specified number of tracks
  --reverse                      Download tracks in reverse order
-  --remove-from-playlist         Remove downloaded tracks from playlist (spotify only)
+  --remove-from-playlist         Remove downloaded tracks from playlist (for spotify only)
  --name-format <format>         Name format for downloaded tracks, e.g "{artist} - {title}"
  --m3u                          Create an m3u8 playlist file

@ -102,54 +98,50 @@ Options:
  --banned-users <list>          Comma-separated list of users to ignore
  --danger-words <list>          Comma-separated list of words that must appear in either
                                 both search result and track title or in neither of the
-                                 two. Case-insensitive. (default:"mix, edit, dj, cover")
+                                 two. Case-insensitive. (default:"remix, edit,cover")
  --pref-format <format>         Preferred file format(s), comma-separated (default: mp3)
-  --pref-length-tol <tol>        Preferred length tolerance in seconds (default: 3)
+  --pref-length-tol <tol>        Preferred length tolerance in seconds (default: 2)
  --pref-min-bitrate <rate>      Preferred minimum bitrate (default: 200)
  --pref-max-bitrate <rate>      Preferred maximum bitrate (default: 2200)
  --pref-max-samplerate <rate>   Preferred maximum sample rate (default: 96000)
-  --pref-strict-title            Prefer download if filename contains track title
  --pref-strict-artist           Prefer download if filepath contains track artist
  --pref-banned-users <list>     Comma-separated list of users to deprioritize
  --pref-danger-words <list>     Comma-separated list of words that should appear in either
                                 both search result and track title or in neither of the
-                                 two.
+                                 two. (default: see github)

  -s --skip-existing             Skip if a track matching file conditions is found in the
                                 output folder or your music library (if provided)
-  --skip-mode <mode>             Sets the way the program checks if a track exists
-                                 name: Use only filenames
+  --skip-mode <mode>             name: Use only filenames to check if a track exists
                                 name-precise (default): Use filenames and check conditions
                                 tag: Use file tags (slower)
                                 tag-precise: Use file tags and check file conditions
  --music-dir <path>             Specify to skip downloading tracks found in a music library
-                                 use with --skip-existing
+                                 Use with --skip-existing
  --skip-not-found               Skip searching for tracks that weren't found on Soulseek
                                 during the last run.
  --remove-ft                    Remove "ft." or "feat." and everything after from the
-                                 track names before searching.
-  --remove-brackets              Remove text in square brackets from track names before
-                                 searching.
+                                 track names before searching
+  --remove-regex <regex>         Remove text matching a regex from all track and artist names
  --no-artist-search             Perform a search without artist name if nothing was
                                 found. Only use for sources such as youtube or soundcloud
                                 where the "artist" could just be an uploader.
  --artist-search                Also try to find track by searching for the artist only
-  --no-regex-search <reg>        Perform an additional search without a regex pattern
-  --no-diacr-search              Perform an additional search without diacritics
-  -d --desperate                 Equivalent to enabling all additional searches, slower.
+  --no-diacr-search              Also perform a search without diacritics
+  --no-regex-search <regex>      Also perform a search with matches of a regex pattern removed
  --yt-dlp                       Use yt-dlp to download tracks that weren't found on
                                 Soulseek. yt-dlp must be available from the command line.

  --config <path>                Specify config file location
  --search-timeout <ms>          Max search time in ms (default: 6000)
  --max-stale-time <ms>          Max download time without progress in ms (default: 50000)
-  --concurrent-processes <num>   Max concurrent searches & downloads (default: 2)
-  --display <option>             Changes how searches and downloads are displayed.
-                                 single (default): Show transfer state and percentage.
-                                 double: Also show a progress bar.
-                                 simple: Only printing
+  --concurrent-downloads <num>   Max concurrent searches & downloads (default: 2)
+  --display <option>             Changes how searches and downloads are displayed:
+                                 single (default): Show transfer state and percentage
+                                 double: Transfer state and a large progress bar
+                                 simple: No download bars or changing percentages

-  --print <option>               Only print tracks or results instead of downloading.
+  --print <option>               Print tracks or search results instead of downloading:
                                 tracks: Print all tracks to be downloaded
                                 tracks-full: Print extended information about all tracks
                                 results: Print search results satisfying file conditions
--- a/slsk-batchdl/Program.cs
+++ b/slsk-batchdl/Program.cs
--- a/slsk-batchdl/Spotify.cs
+++ b/slsk-batchdl/Spotify.cs
@ -85,7 +85,7 @@ public class Spotify
    public async Task<List<Track>> GetLikes(int max = int.MaxValue, int offset = 0)
    {
        if (!loggedIn)
-            throw new Exception("Can't get liked music, not logged in");
+            throw new Exception("Can't get liked music as user is not logged in");

        List<Track> res = new List<Track>();
        int limit = Math.Min(max, 50);
--- a/slsk-batchdl/YouTube.cs
+++ b/slsk-batchdl/YouTube.cs
@ -3,7 +3,12 @@ using Google.Apis.Services;
 using System.Xml;
 using YoutubeExplode;
 using System.Text.RegularExpressions;
-
+using YoutubeExplode.Common;
+using System.Diagnostics;
+using HtmlAgilityPack;
+using System.Text;
+using System.Threading.Channels;
+using System.Collections.Concurrent;

 public static class YouTube
 {
@ -67,10 +72,10 @@ public static class YouTube
                    break;
            }

-            if (tracksDict.Count >= 200)
+            if (tracksDict.Count >= 200 && !Console.IsOutputRedirected)
            {
                Console.SetCursorPosition(0, Console.CursorTop);
-                Console.Write(tracks.Count);
+                Console.Write($"Loaded: {tracks.Count}");
            }

            playlistItemsRequest.PageToken = playlistItemsResponse.NextPageToken;
@ -93,30 +98,6 @@ public static class YouTube

        title = title.Replace("–", "-");

-        var stringsToRemove = new string[] { "(Official music video)", "(Official video)", "(Official audio)",
-                    "(Lyrics)", "(Official)", "(Lyric Video)", "(Official Lyric Video)", "(Official HD Video)",
-                    "(Official 4K Video)", "(Video)", "[HD]", "[4K]", "(Original Mix)", "(Lyric)", "(Music Video)", 
-                    "(Visualizer)", "(Audio)", "Official Lyrics" };
-
-        foreach (string s in stringsToRemove)
-        {
-            var t = title;
-            title = Regex.Replace(title, Regex.Escape(s), "", RegexOptions.IgnoreCase);
-            if (t == title)
-            {
-                if (s.Contains("["))
-                {
-                    string s2 = s.Replace("[", "(").Replace("]", ")");
-                    title = Regex.Replace(title, Regex.Escape(s2), "", RegexOptions.IgnoreCase);
-                }
-                else if (s.Contains("("))
-                {
-                    string s2 = s.Replace("(", "[").Replace(")", "]");
-                    title = Regex.Replace(title, Regex.Escape(s2), "", RegexOptions.IgnoreCase);
-                }
-            }
-        }
-
        var trackTitle = title.Trim();
        trackTitle = Regex.Replace(trackTitle, @"\s+", " ");
        var artist = uploader.Trim();
@ -297,4 +278,159 @@ public static class YouTube
        var playlist = await youtube.Playlists.GetAsync(url);
        return playlist.Id.ToString();
    }
+
+    /// <summary>
+    /// Recovers metadata of playlist entries that yt-dlp reports under its "!uploader"
+    /// match filter by scraping the oldest Wayback Machine snapshot of each video page.
+    /// </summary>
+    public class YouTubeArchiveRetriever
+    {
+        private const int WorkerCount = 4;
+        private const int MaxRequestAttempts = 3;
+
+        private readonly HttpClient _client;
+
+        public YouTubeArchiveRetriever()
+        {
+            _client = new HttpClient { Timeout = TimeSpan.FromSeconds(10) };
+        }
+
+        /// <summary>
+        /// Runs yt-dlp on <paramref name="url"/> to print the page url of every entry that
+        /// matches the "!uploader" filter, then looks each one up on the Wayback Machine.
+        /// </summary>
+        /// <param name="url">Url handed to yt-dlp (presumably a playlist url; not validated here).</param>
+        /// <returns>Tracks parsed from the archived pages for which a title could be extracted.</returns>
+        public async Task<List<Track>> RetrieveDeleted(string url)
+        {
+            var deletedVideoUrls = new BlockingCollection<string>();
+            var tracks = new ConcurrentBag<Track>();
+
+            var startInfo = new ProcessStartInfo
+            {
+                FileName = "yt-dlp",
+                RedirectStandardOutput = true,
+                UseShellExecute = false,
+                CreateNoWindow = true,
+            };
+            // ArgumentList escapes every entry, so a url containing spaces or quotes is
+            // still passed to yt-dlp as one single argument.
+            foreach (var arg in new[] { "--ignore-no-formats-error", "--no-warn", "--match-filter", "!uploader", "--print", "webpage_url", url })
+                startInfo.ArgumentList.Add(arg);
+
+            using var process = new Process { StartInfo = startInfo };
+            bool ok = false;
+            process.OutputDataReceived += (sender, e) =>
+            {
+                // A null line is delivered once the redirected stream is closed. Completing the
+                // queue here (instead of in an Exited handler) guarantees that no line can arrive
+                // after CompleteAdding, which would make Add throw InvalidOperationException.
+                if (e.Data == null)
+                {
+                    deletedVideoUrls.CompleteAdding();
+                    return;
+                }
+                if (string.IsNullOrWhiteSpace(e.Data))
+                    return;
+
+                if (!ok) { Console.WriteLine("Got first video"); ok = true; }
+                deletedVideoUrls.Add(e.Data.Trim());
+            };
+
+            process.Start();
+            process.BeginOutputReadLine();
+
+            var workers = new List<Task>();
+            for (int i = 0; i < WorkerCount; i++)
+            {
+                workers.Add(Task.Run(async () =>
+                {
+                    // Ends once the queue is both completed and drained.
+                    foreach (var videoUrl in deletedVideoUrls.GetConsumingEnumerable())
+                    {
+                        var waybackUrl = await GetOldestArchiveUrl(videoUrl);
+                        if (string.IsNullOrEmpty(waybackUrl))
+                            continue;
+
+                        var details = await GetVideoDetails(waybackUrl);
+                        if (string.IsNullOrEmpty(details.title))
+                            continue;
+
+                        var track = await ParseTrackInfo(details.title, details.uploader, waybackUrl, details.duration, false);
+                        tracks.Add(track);
+                        if (!Console.IsOutputRedirected)
+                        {
+                            Console.SetCursorPosition(0, Console.CursorTop);
+                            Console.Write($"Deleted videos processed: {tracks.Count}");
+                        }
+                    }
+                }));
+            }
+
+            await Task.WhenAll(workers);
+            process.WaitForExit();
+            Console.WriteLine();
+            return tracks.ToList();
+        }
+
+        /// <summary>
+        /// Asks the Wayback Machine CDX endpoint for the oldest capture of <paramref name="url"/>
+        /// that was stored with status code 200.
+        /// </summary>
+        /// <returns>
+        /// The snapshot url, or null when the input is blank, the request keeps failing, the
+        /// response is not successful, or no usable capture line is returned.
+        /// </returns>
+        private async Task<string> GetOldestArchiveUrl(string url)
+        {
+            if (string.IsNullOrWhiteSpace(url))
+                return null;
+
+            // The video url carries its own '?' / '&' characters, so it has to be escaped to
+            // stay a single value of the "url" query parameter.
+            var query = $"http://web.archive.org/cdx/search/cdx?url={Uri.EscapeDataString(url)}&fl=timestamp,original&filter=statuscode:200&sort=timestamp:asc&limit=1";
+
+            for (int attempt = 0; attempt < MaxRequestAttempts; attempt++)
+            {
+                try
+                {
+                    using var response = await _client.GetAsync(query);
+                    if (!response.IsSuccessStatusCode)
+                        return null;
+
+                    var content = await response.Content.ReadAsStringAsync();
+                    var firstLine = content.Split("\n").FirstOrDefault(line => !string.IsNullOrWhiteSpace(line));
+                    if (firstLine == null)
+                        return null;
+
+                    // Expected line layout: "<timestamp> <original url>".
+                    var parts = firstLine.Trim().Split(" ");
+                    if (parts.Length < 2)
+                        return null;
+
+                    return $"http://web.archive.org/web/{parts[0]}/{parts[1]}";
+                }
+                catch (HttpRequestException)
+                {
+                    // Transient network failure: try again.
+                }
+                catch (TaskCanceledException)
+                {
+                    // HttpClient timeout: try again.
+                }
+            }
+            return null;
+        }
+
+        /// <summary>
+        /// Loads the page at <paramref name="url"/> and extracts title, uploader and duration by
+        /// trying a list of XPath patterns in order.
+        /// </summary>
+        /// <returns>
+        /// Title and uploader (empty string when no pattern matched) and the duration in
+        /// seconds (-1 when the duration meta tag is missing or cannot be parsed).
+        /// </returns>
+        public async Task<(string title, string uploader, int duration)> GetVideoDetails(string url)
+        {
+            var web = new HtmlWeb();
+            var doc = await web.LoadFromWebAsync(url);
+
+            var titlePatterns = new[]
+            {
+                "//h1[@id='video_title']",
+                "//meta[@name='title']",
+            };
+
+            var usernamePatterns = new[]
+            {
+                "//div[@id='userInfoDiv']/b/a",
+                "//a[contains(@class, 'contributor')]",
+                "//a[@id='watch-username']",
+                "//a[contains(@class, 'author')]",
+                "//div[@class='yt-user-info']/a",
+                "//div[@id='upload-info']//yt-formatted-string/a",
+                "//span[@itemprop='author']//link[@itemprop='name']",
+                "//a[contains(@class, 'yt-user-name')]",
+            };
+
+            // Returns the first non-empty value produced by any of the patterns, or "".
+            string getItem(string[] patterns)
+            {
+                foreach (var pattern in patterns)
+                {
+                    var match = doc.DocumentNode.SelectSingleNode(pattern);
+                    if (match == null)
+                        continue;
+
+                    // meta / itemprop elements carry their value in the "content" attribute.
+                    // InnerText is returned with HTML entities still encoded, so decode it.
+                    string res = pattern.StartsWith("//meta") || pattern.Contains("@itemprop")
+                        ? match.GetAttributeValue("content", "")
+                        : HtmlEntity.DeEntitize(match.InnerText);
+
+                    if (!string.IsNullOrEmpty(res)) return res;
+                }
+                return "";
+            }
+
+            int duration = -1;
+            var durationNode = doc.DocumentNode.SelectSingleNode("//meta[@itemprop='duration']");
+            var rawDuration = durationNode?.GetAttributeValue("content", "");
+            if (!string.IsNullOrEmpty(rawDuration))
+            {
+                try
+                {
+                    duration = (int)XmlConvert.ToTimeSpan(rawDuration).TotalSeconds;
+                }
+                catch (FormatException)
+                {
+                    // Malformed duration in the archived page: keep the "unknown" value -1.
+                }
+            }
+
+            return (getItem(titlePatterns), getItem(usernamePatterns), duration);
+        }
+    }
 }
--- a/slsk-batchdl/slsk-batchdl.csproj
+++ b/slsk-batchdl/slsk-batchdl.csproj
@ -19,12 +19,13 @@

  <ItemGroup>
    <PackageReference Include="Goblinfactory.ProgressBar" Version="1.0.0" />
-    <PackageReference Include="Google.Apis.YouTube.v3" Version="1.60.0.2945" />
-    <PackageReference Include="Soulseek" Version="6.1.1" />
-    <PackageReference Include="SpotifyAPI.Web" Version="7.0.0" />
-    <PackageReference Include="SpotifyAPI.Web.Auth" Version="7.0.0" />
+    <PackageReference Include="Google.Apis.YouTube.v3" Version="1.63.0.3205" />
+    <PackageReference Include="HtmlAgilityPack" Version="1.11.54" />
+    <PackageReference Include="Soulseek" Version="6.1.3" />
+    <PackageReference Include="SpotifyAPI.Web" Version="7.0.2" />
+    <PackageReference Include="SpotifyAPI.Web.Auth" Version="7.0.2" />
    <PackageReference Include="TagLibSharp" Version="2.3.0" />
-    <PackageReference Include="YoutubeExplode" Version="6.2.12" />
+    <PackageReference Include="YoutubeExplode" Version="6.3.7" />
  </ItemGroup>

 </Project>