From 11579385a9ddad22c8bfaba73840a245e2f42a45 Mon Sep 17 00:00:00 2001 From: fiso64 Date: Tue, 28 May 2024 14:02:03 +0200 Subject: [PATCH] commit --- README.md | 2 + slsk-batchdl/Program.cs | 43 +++++++---- slsk-batchdl/Utils.cs | 3 +- slsk-batchdl/YouTube.cs | 155 ++++++++++++++++++++++++++++++---------- 4 files changed, 153 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index e80b832..fbb0fd4 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,8 @@ Options: --youtube-key Youtube data API key --get-deleted Attempt to retrieve titles of deleted videos from wayback machine. Requires yt-dlp. + --deleted-only Only retrieve & download deleted music. Combine with --print + tracks-full to display a list of all deleted titles & urls. --time-format Time format in Length column of the csv file (e.g h:m:s.ms for durations like 1:04:35.123). Default: s diff --git a/slsk-batchdl/Program.cs b/slsk-batchdl/Program.cs index d14fbb1..428e7bc 100644 --- a/slsk-batchdl/Program.cs +++ b/slsk-batchdl/Program.cs @@ -114,6 +114,7 @@ static class Program static bool useTagsCheckExisting = false; static bool removeTracksFromSource = false; static bool getDeleted = false; + static bool deletedOnly = false; static bool removeSingleCharacterSearchTerms = false; static int maxTracks = int.MaxValue; static int minUsersAggregate = 2; @@ -201,6 +202,8 @@ static class Program "\n --youtube-key Youtube data API key" + "\n --get-deleted Attempt to retrieve titles of deleted videos from wayback" + "\n machine. Requires yt-dlp." + + "\n --deleted-only Only retrieve & download deleted music. Combine with --print" + + "\n tracks-full to display a list of all deleted titles & urls." + "\n" + "\n --time-format Time format in Length column of the csv file (e.g h:m:s.ms" + "\n for durations like 1:04:35.123). Default: s" + @@ -221,7 +224,7 @@ static class Program "\n --pref-format Preferred file format(s), comma-separated (default: mp3)" + "\n --pref-length-tol Preferred length tolerance in seconds (default: 2)" + "\n --pref-min-bitrate Preferred minimum bitrate (default: 200)" + - "\n --pref-max-bitrate Preferred maximum bitrate (default: 2200)" + + "\n --pref-max-bitrate Preferred maximum bitrate (default: 2500)" + "\n --pref-min-samplerate Preferred minimum sample rate" + "\n --pref-max-samplerate Preferred maximum sample rate (default: 48000)" + "\n --pref-min-bitdepth Preferred minimum bit depth" + @@ -566,6 +569,11 @@ static class Program case "--get-deleted": getDeleted = true; break; + case "--do": + case "--deleted-only": + getDeleted = true; + deletedOnly = true; + break; case "--re": case "--regex": string s = args[++i].Replace("\\;", "<>"); @@ -992,23 +1000,30 @@ static class Program string name; List? deleted = null; - List tracks; + List tracks = new(); if (getDeleted) { Console.WriteLine("Getting deleted videos.."); var archive = new YouTube.YouTubeArchiveRetriever(); - deleted = await archive.RetrieveDeleted(ytUrl); + deleted = await archive.RetrieveDeleted(ytUrl, printFailed: deletedOnly); } - if (YouTube.apiKey != "") + if (!deletedOnly) { - Console.WriteLine("Loading YouTube playlist (API)"); - (name, tracks) = await YouTube.GetTracksApi(ytUrl, max, off); + if (YouTube.apiKey != "") + { + Console.WriteLine("Loading YouTube playlist (API)"); + (name, tracks) = await YouTube.GetTracksApi(ytUrl, max, off); + } + else + { + Console.WriteLine("Loading YouTube playlist"); + (name, tracks) = await YouTube.GetTracksYtExplode(ytUrl, max, off); + } } else { - Console.WriteLine("Loading YouTube playlist"); - (name, tracks) = await YouTube.GetTracksYtExplode(ytUrl, max, off); + name = await YouTube.GetPlaylistTitle(ytUrl); } if (deleted != null) { @@ -2582,7 +2597,7 @@ static class Program if (!noRemoveSpecialChars) { old = str; - str = str.ReplaceSpecialChars(" ").RemoveConsecutiveWs().Trim(); + str = str.ReplaceSpecialChars(" ").Trim().RemoveConsecutiveWs(); if (str == "") str = old; } foreach (var banned in bannedTerms) @@ -2602,7 +2617,7 @@ static class Program public static Track InferTrack(string filename, Track defaultTrack) { Track t = new Track(defaultTrack); - filename = GetFileNameWithoutExtSlsk(filename).Replace(" — ", " - ").Replace("_", " ").RemoveConsecutiveWs().Trim(); + filename = GetFileNameWithoutExtSlsk(filename).Replace(" — ", " - ").Replace("_", " ").Trim().RemoveConsecutiveWs(); var trackNumStart = new Regex(@"^(?:(?:[0-9][-\.])?\d{2,3}[. -]|\b\d\.\s|\b\d\s-\s)(?=.+\S)"); //var trackNumMiddle = new Regex(@"\s+-\s+(\d{2,3})(?: -|\.|)\s+|\s+-(\d{2,3})-\s+"); @@ -3131,11 +3146,11 @@ static class Program fname = fname.Replace("_", " ").ReplaceInvalidChars(" ", true, false); fname = regexRemove != "" ? Regex.Replace(fname, regexRemove, "") : fname; fname = diacrRemove ? fname.RemoveDiacritics() : fname; - fname = fname.Trim(); + fname = fname.Trim().RemoveConsecutiveWs(); tname = tname.Replace("_", " ").ReplaceInvalidChars(" ", true, false); tname = regexRemove != "" ? Regex.Replace(tname, regexRemove, "") : tname; tname = diacrRemove ? tname.RemoveDiacritics() : tname; - tname = tname.Trim(); + tname = tname.Trim().RemoveConsecutiveWs(); if (boundarySkipWs) return fname.ContainsWithBoundaryIgnoreWs(tname, ignoreCase, acceptLeftDigit: true); @@ -4073,6 +4088,8 @@ static class Program Console.WriteLine($" Album: {tracks[i].Album}"); if (!string.IsNullOrEmpty(tracks[i].URI)) Console.WriteLine($" URL/ID: {tracks[i].URI}"); + if (!string.IsNullOrEmpty(tracks[i].Other)) + Console.WriteLine($" Other: {tracks[i].Other}"); if (tracks[i].ArtistMaybeWrong) Console.WriteLine($" Artist maybe wrong: {tracks[i].ArtistMaybeWrong}"); if (tracks[i].Downloads != null) { @@ -4358,6 +4375,7 @@ public struct Track public bool IsNotAudio = false; public string FailureReason = ""; public string DownloadPath = ""; + public string Other = ""; public State TrackState = State.Initial; public SlDictionary? Downloads = null; @@ -4387,6 +4405,7 @@ public struct Track TrackState = other.TrackState; FailureReason = other.FailureReason; DownloadPath = other.DownloadPath; + Other = other.Other; } public override readonly string ToString() diff --git a/slsk-batchdl/Utils.cs b/slsk-batchdl/Utils.cs index 04e9ff6..1f508f7 100644 --- a/slsk-batchdl/Utils.cs +++ b/slsk-batchdl/Utils.cs @@ -1,6 +1,5 @@ using System.Net; using System.Text.RegularExpressions; -using System.Xml.Linq; public static class Utils { @@ -112,7 +111,7 @@ public static class Utils public static string RemoveConsecutiveWs(this string input) { - return Regex.Replace(input, @"\s+", " "); + return string.Join(' ', input.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)); } public static string RemoveSquareBrackets(this string str) diff --git a/slsk-batchdl/YouTube.cs b/slsk-batchdl/YouTube.cs index 120469e..3075a76 100644 --- a/slsk-batchdl/YouTube.cs +++ b/slsk-batchdl/YouTube.cs @@ -6,10 +6,7 @@ using System.Text.RegularExpressions; using YoutubeExplode.Common; using System.Diagnostics; using HtmlAgilityPack; -using System.Text; -using System.Threading.Channels; using System.Collections.Concurrent; -using System; public static class YouTube { @@ -98,8 +95,8 @@ public static class YouTube var track = new Track(); track.URI = id; - uploader = Regex.Replace(uploader.Replace("–", "-").Trim(), @"\s+", " "); - title = Regex.Replace(title.Replace("–", "-").Trim(), @"\s+", " "); + uploader = uploader.Replace("–", "-").Trim().RemoveConsecutiveWs(); + title = title.Replace("–", "-").Replace(" -- ", " - ").Trim().RemoveConsecutiveWs(); var artist = uploader; var trackTitle = title; @@ -258,6 +255,13 @@ public static class YouTube return tracks; } + public static async Task GetPlaylistTitle(string url) + { + var youtube = new YoutubeClient(); + var playlist = await youtube.Playlists.GetAsync(url); + return playlist.Title; + } + public static async Task<(string, List)> GetTracksYtExplode(string url, int max = int.MaxValue, int offset = 0) { var youtube = new YoutubeClient(); @@ -305,12 +309,34 @@ public static class YouTube _client.Timeout = TimeSpan.FromSeconds(10); } - public async Task> RetrieveDeleted(string url) + public async Task> RetrieveDeleted(string url, bool printFailed = true) { var deletedVideoUrls = new BlockingCollection(); - var tracks = new ConcurrentBag(); - var process = new Process() + int totalCount = 0; + int archivedCount = 0; + var tracks = new ConcurrentBag(); + var noArchive = new ConcurrentBag(); + var failRetrieve = new ConcurrentBag(); + + int workerCount = 4; + var workers = new List(); + var consoleLock = new object(); + + void updateInfo() + { + lock (consoleLock) + { + if (!Console.IsOutputRedirected) + { + string info = "Deleted metadata total/archived/retrieved: "; + Console.SetCursorPosition(0, Console.CursorTop); + Console.Write($"{info}{totalCount}/{archivedCount}/{tracks.Count}"); + } + } + } + + var process = new Process { StartInfo = new ProcessStartInfo { @@ -319,14 +345,17 @@ public static class YouTube RedirectStandardOutput = true, UseShellExecute = false, CreateNoWindow = true, - } + }, + EnableRaisingEvents = true }; - process.EnableRaisingEvents = true; - bool ok = false; process.OutputDataReceived += (sender, e) => { - if (!ok) { Console.WriteLine("Got first video"); ok = true; } - deletedVideoUrls.Add(e.Data); + if (!string.IsNullOrWhiteSpace(e.Data)) + { + deletedVideoUrls.Add(e.Data); + Interlocked.Increment(ref totalCount); + updateInfo(); + } }; process.Exited += (sender, e) => { @@ -336,29 +365,42 @@ public static class YouTube process.Start(); process.BeginOutputReadLine(); - List workers = new List(); - int workerCount = 4; for (int i = 0; i < workerCount; i++) { workers.Add(Task.Run(async () => { foreach (var videoUrl in deletedVideoUrls.GetConsumingEnumerable()) { - var waybackUrl = await GetOldestArchiveUrl(videoUrl); - if (!string.IsNullOrEmpty(waybackUrl)) + var waybackUrls = await GetOldestArchiveUrls(videoUrl, limit: 2); + if (waybackUrls != null && waybackUrls.Count > 0) { - var x = await GetVideoDetails(waybackUrl); - if (!string.IsNullOrEmpty(x.title)) + Interlocked.Increment(ref archivedCount); + + bool good = false; + foreach (var waybackUrl in waybackUrls) { - var track = await ParseTrackInfo(x.title, x.uploader, waybackUrl, x.duration); - tracks.Add(track); - if (!Console.IsOutputRedirected) + var (title, uploader, duration) = await GetVideoDetails(waybackUrl); + if (!string.IsNullOrWhiteSpace(title)) { - Console.SetCursorPosition(0, Console.CursorTop); - Console.Write($"Deleted videos processed: {tracks.Count}"); + var track = await ParseTrackInfo(title, uploader, waybackUrl, duration); + track.Other = $"{{\"t\":\"{title.Trim()}\",\"u\":\"{uploader.Trim()}\"}}"; + tracks.Add(track); + good = true; + break; } } + + if (!good) + { + failRetrieve.Add(waybackUrls[0]); + } } + else + { + noArchive.Add(videoUrl); + } + + updateInfo(); } })); } @@ -367,12 +409,32 @@ public static class YouTube process.WaitForExit(); deletedVideoUrls.CompleteAdding(); Console.WriteLine(); + + if (printFailed) + { + if (archivedCount < totalCount) + { + Console.WriteLine("No archived version found for the following:"); + foreach (var x in noArchive) + Console.WriteLine($" {x}"); + Console.WriteLine(); + + } + if (tracks.Count < archivedCount) + { + Console.WriteLine("Failed to parse archived version for the following:"); + foreach (var x in failRetrieve) + Console.WriteLine($" {x}"); + Console.WriteLine(); + } + } + return tracks.ToList(); } - private async Task GetOldestArchiveUrl(string url) + private async Task> GetOldestArchiveUrls(string url, int limit) { - var url2 = $"http://web.archive.org/cdx/search/cdx?url={url}&fl=timestamp,original&filter=statuscode:200&sort=timestamp:asc&limit=1"; + var url2 = $"http://web.archive.org/cdx/search/cdx?url={url}&fl=timestamp,original&filter=statuscode:200&sort=timestamp:asc&limit={limit}"; HttpResponseMessage response = null; for (int i = 0; i < 3; i++) { @@ -388,13 +450,16 @@ public static class YouTube { var content = await response.Content.ReadAsStringAsync(); var lines = content.Split("\n").Where(line => !string.IsNullOrWhiteSpace(line)).ToList(); - if (lines.Any()) + if (lines.Count > 0) { - var parts = lines[0].Split(" "); - var timestamp = parts[0]; - var originalUrl = parts[1]; - var oldestArchive = $"http://web.archive.org/web/{timestamp}/{originalUrl}"; - return oldestArchive; + for (int i = 0; i < lines.Count; i++) + { + var parts = lines[i].Split(" "); + var timestamp = parts[0]; + var originalUrl = parts[1]; + lines[i] = $"http://web.archive.org/web/{timestamp}/{originalUrl}"; + } + return lines; } } return null; @@ -428,25 +493,43 @@ public static class YouTube foreach (var pattern in patterns) { var node = doc.DocumentNode.SelectSingleNode(pattern); - var res = ""; if (node != null) { + var res = ""; if (pattern.StartsWith("//meta") || pattern.Contains("@itemprop")) res = node.GetAttributeValue("content", ""); else res = node.InnerText; - if (!string.IsNullOrEmpty(res)) return res; + if (!string.IsNullOrEmpty(res)) + return Utils.UnHtmlString(res); } } return ""; } + var title = getItem(titlePatterns); + if (string.IsNullOrEmpty(title)) + { + var pattern = @"document\.title\s*=\s*""(.+?) - YouTube"";"; + var match = Regex.Match(doc.Text, pattern); + if (match.Success) + title = match.Groups[1].Value; + } + + var username = getItem(usernamePatterns); + int duration = -1; var node = doc.DocumentNode.SelectSingleNode("//meta[@itemprop='duration']"); if (node != null) - duration = (int)XmlConvert.ToTimeSpan(node.GetAttributeValue("content", "")).TotalSeconds; + { + try + { + duration = (int)XmlConvert.ToTimeSpan(node.GetAttributeValue("content", "")).TotalSeconds; + } + catch { } + } - return (getItem(titlePatterns), getItem(usernamePatterns), duration); + return (title, username, duration); } }