From 785d89d6ab29c36f3d5a8ea1dbdf9dd5449d04a4 Mon Sep 17 00:00:00 2001 From: Thomas Miceli <27960254+thomiceli@users.noreply.github.com> Date: Sat, 27 Apr 2024 01:49:53 +0200 Subject: [PATCH] Rework git log parsing and truncating (#260) --- internal/git/commands.go | 4 +- internal/git/commands_test.go | 2 +- internal/git/output_parser.go | 369 +++++++++++++++++++++++---------- templates/pages/revisions.html | 2 +- 4 files changed, 268 insertions(+), 109 deletions(-) diff --git a/internal/git/commands.go b/internal/git/commands.go index 6850d8f..2bd0645 100644 --- a/internal/git/commands.go +++ b/internal/git/commands.go @@ -24,6 +24,8 @@ var ( ) const truncateLimit = 2 << 18 +const diffSize = 2 << 12 +const maxFilesPerDiffCommit = 10 type RevisionNotFoundError struct{} @@ -313,7 +315,7 @@ func GetLog(user string, gist string, skip int) ([]*Commit, error) { } }(cmd) - return parseLog(stdout, truncateLimit), err + return parseLog(stdout, maxFilesPerDiffCommit, diffSize) } func CloneTmp(user string, gist string, gistTmpId string, email string, remove bool) error { diff --git a/internal/git/commands_test.go b/internal/git/commands_test.go index efa761d..d17a98b 100644 --- a/internal/git/commands_test.go +++ b/internal/git/commands_test.go @@ -125,7 +125,7 @@ like Opengist actually`, require.Contains(t, commits[0].Files, File{ Filename: "my_other_file.txt", - OldFilename: "", + OldFilename: "my_other_file.txt", Content: `@@ -1,2 +1,2 @@ I really -hate Opengist diff --git a/internal/git/output_parser.go b/internal/git/output_parser.go index 82ebea4..cefb9d8 100644 --- a/internal/git/output_parser.go +++ b/internal/git/output_parser.go @@ -6,7 +6,6 @@ import ( "encoding/csv" "fmt" "io" - "regexp" "strings" ) @@ -63,129 +62,287 @@ func truncateCommandOutput(out io.Reader, maxBytes int64) (string, bool, error) return string(buf), truncated, nil } -func parseLog(out io.Reader, maxBytes int) []*Commit { - scanner := bufio.NewScanner(out) - +// inspired from https://github.com/go-gitea/gitea/blob/main/services/gitdiff/gitdiff.go +func parseLog(out io.Reader, maxFiles int, maxBytes int) ([]*Commit, error) { var commits []*Commit var currentCommit *Commit var currentFile *File - var isContent bool - var bytesRead = 0 - scanNext := true + var headerParsed = false + var skipped = false + var line string + var err error + + input := bufio.NewReaderSize(out, maxBytes) + + // Loop Commits +loopLog: + for { + // If a commit was skipped, do not read a new line + if !skipped { + line, err = input.ReadString('\n') + if err != nil { + if err == io.EOF { + break loopLog + } + return commits, err + } + } + + // Remove trailing newline characters + if len(line) > 0 && (line[len(line)-1] == '\n' || line[len(line)-1] == '\r') { + line = line[:len(line)-1] + } + + // Attempt to parse commit header (hash, author, mail, timestamp) or a diff + switch line[0] { + // Commit hash + case 'c': + if headerParsed { + commits = append(commits, currentCommit) + } + skipped = false + currentCommit = &Commit{Hash: line[2:], Files: []File{}} + continue + + // Author name + case 'a': + headerParsed = true + currentCommit.AuthorName = line[2:] + continue + + // Author email + case 'm': + currentCommit.AuthorEmail = line[2:] + continue + + // Commit timestamp + case 't': + currentCommit.Timestamp = line[2:] + continue + + // Commit shortstat + case ' ': + changed := []byte(line)[1:] + changed = bytes.ReplaceAll(changed, []byte("(+)"), []byte("")) + changed = bytes.ReplaceAll(changed, []byte("(-)"), []byte("")) + currentCommit.Changed = string(changed) + + // shortstat is followed by an empty line + line, err = input.ReadString('\n') + if err != nil { + if err == io.EOF { + break loopLog + } + return commits, err + } + continue + + // Commit diff + default: + // Loop files in diff + loopCommit: + for { + // If we have reached the maximum number of files to show for a single commit, skip to the next commit + if len(currentCommit.Files) >= maxFiles { + line, err = skipToNextCommit(input) + if err != nil { + if err == io.EOF { + break loopLog + } + return commits, err + } + + // Skip to the next commit + headerParsed = false + skipped = true + break loopCommit + } + + // Else create a new file and parse it + currentFile = &File{} + parseRename := true + + loopFileDiff: + for { + line, err = input.ReadString('\n') + if err != nil { + if err != io.EOF { + return commits, err + } + headerParsed = false + break loopCommit + } + + // If the line is a newline character, the commit is finished + if line == "\n" { + currentCommit.Files = append(currentCommit.Files, *currentFile) + headerParsed = false + break loopCommit + } + + // Attempt to parse the file header + switch { + case strings.HasPrefix(line, "diff --git"): + currentCommit.Files = append(currentCommit.Files, *currentFile) + headerParsed = false + break loopFileDiff + case strings.HasPrefix(line, "old mode"): + case strings.HasPrefix(line, "new mode"): + case strings.HasPrefix(line, "index"): + case strings.HasPrefix(line, "similarity index"): + case strings.HasPrefix(line, "dissimilarity index"): + continue + case strings.HasPrefix(line, "rename from "): + currentFile.OldFilename = line[12 : len(line)-1] + case strings.HasPrefix(line, "rename to "): + currentFile.Filename = line[10 : len(line)-1] + parseRename = false + case strings.HasPrefix(line, "copy from "): + currentFile.OldFilename = line[10 : len(line)-1] + case strings.HasPrefix(line, "copy to "): + currentFile.Filename = line[8 : len(line)-1] + parseRename = false + case strings.HasPrefix(line, "new file"): + currentFile.IsCreated = true + case strings.HasPrefix(line, "deleted file"): + currentFile.IsDeleted = true + case strings.HasPrefix(line, "--- "): + name := line[4 : len(line)-1] + if parseRename && currentFile.IsDeleted { + currentFile.Filename = name[2:] + } else if parseRename && strings.HasPrefix(name, "a/") { + currentFile.OldFilename = name[2:] + } + case strings.HasPrefix(line, "+++ "): + name := line[4 : len(line)-1] + if parseRename && strings.HasPrefix(name, "b/") { + currentFile.Filename = name[2:] + } + + // Header is finally parsed, now we can parse the file diff content + lineBytes, isFragment, err := parseDiffContent(currentFile, maxBytes, input) + if err != nil { + if err != io.EOF { + return commits, err + } + + // EOF reached, commit is finished + currentCommit.Files = append(currentCommit.Files, *currentFile) + headerParsed = false + break loopCommit + } + + currentCommit.Files = append(currentCommit.Files, *currentFile) + + if string(lineBytes) == "" { + headerParsed = false + break loopCommit + } + + for isFragment { + _, isFragment, err = input.ReadLine() + if err != nil { + return commits, fmt.Errorf("unable to ReadLine: %w", err) + } + } + + break loopFileDiff + } + } + } + } + commits = append(commits, currentCommit) + } + + return commits, nil +} + +func parseDiffContent(currentFile *File, maxBytes int, input *bufio.Reader) (lineBytes []byte, isFragment bool, err error) { + sb := &strings.Builder{} + var currFileLineCount int for { - if scanNext && !scanner.Scan() { - break - } - scanNext = true + for isFragment { + currentFile.Truncated = true - // new commit found - currentFile = nil - currentCommit = &Commit{Hash: string(scanner.Bytes()[2:]), Files: []File{}} - - scanner.Scan() - currentCommit.AuthorName = string(scanner.Bytes()[2:]) - - scanner.Scan() - currentCommit.AuthorEmail = string(scanner.Bytes()[2:]) - - scanner.Scan() - currentCommit.Timestamp = string(scanner.Bytes()[2:]) - - scanner.Scan() - - if len(scanner.Bytes()) == 0 { - commits = append(commits, currentCommit) - break + // Read the next line + _, isFragment, err = input.ReadLine() + if err != nil { + return nil, false, err + } } - // if there is no shortstat, it means that the commit is empty, we add it and move onto the next one - if scanner.Bytes()[0] != ' ' { - commits = append(commits, currentCommit) + sb.Reset() - // avoid scanning the next line, as we already did it - scanNext = false + // Read the next line + lineBytes, isFragment, err = input.ReadLine() + if err != nil { + if err == io.EOF { + return lineBytes, isFragment, err + } + return nil, false, err + } + + // End of file + if len(lineBytes) == 0 { + return lineBytes, false, err + } + if lineBytes[0] == 'd' { + return lineBytes, false, err + } + + if currFileLineCount >= maxBytes { + currentFile.Truncated = true continue } - changed := scanner.Bytes()[1:] - changed = bytes.ReplaceAll(changed, []byte("(+)"), []byte("")) - changed = bytes.ReplaceAll(changed, []byte("(-)"), []byte("")) - currentCommit.Changed = string(changed) - - // twice because --shortstat adds a new line - scanner.Scan() - scanner.Scan() - // commit header parsed - - // files changes inside the commit - for { - line := scanner.Bytes() - - // end of content of file - if len(line) == 0 { - isContent = false - if currentFile != nil { - currentCommit.Files = append(currentCommit.Files, *currentFile) - } - break - } - - // new file found - if bytes.HasPrefix(line, []byte("diff --git")) { - // current file is finished, we can add it to the commit - if currentFile != nil { - currentCommit.Files = append(currentCommit.Files, *currentFile) - } - - // create a new file - isContent = false - bytesRead = 0 - currentFile = &File{} - filenameRegex := regexp.MustCompile(`^diff --git a/(.+) b/(.+)$`) - matches := filenameRegex.FindStringSubmatch(string(line)) - if len(matches) == 3 { - currentFile.Filename = matches[2] - if matches[1] != matches[2] { - currentFile.OldFilename = matches[1] - } - } - scanner.Scan() - continue - } - - if bytes.HasPrefix(line, []byte("new")) { - currentFile.IsCreated = true - } - - if bytes.HasPrefix(line, []byte("deleted")) { - currentFile.IsDeleted = true - } - - // file content found - if line[0] == '@' { - isContent = true - } - - if isContent { - currentFile.Content += string(line) + "\n" - - bytesRead += len(line) - if bytesRead > maxBytes { - currentFile.Truncated = true - currentFile.Content = "" - isContent = false + line := string(lineBytes) + if isFragment { + currentFile.Truncated = true + for isFragment { + lineBytes, isFragment, err = input.ReadLine() + if err != nil { + return lineBytes, isFragment, fmt.Errorf("unable to ReadLine: %w", err) } } - - scanner.Scan() } - commits = append(commits, currentCommit) - + if len(line) > maxBytes { + currentFile.Truncated = true + line = line[:maxBytes] + } + currentFile.Content += line + "\n" } +} - return commits +func skipToNextCommit(input *bufio.Reader) (line string, err error) { + // need to skip until the next cmdDiffHead + var isFragment, wasFragment bool + var lineBytes []byte + for { + lineBytes, isFragment, err = input.ReadLine() + if err != nil { + return "", err + } + if wasFragment { + wasFragment = isFragment + continue + } + if bytes.HasPrefix(lineBytes, []byte("c")) { + break + } + wasFragment = isFragment + } + line = string(lineBytes) + if isFragment { + var tail string + tail, err = input.ReadString('\n') + if err != nil { + return "", err + } + line += tail + } + return line, err } func ParseCsv(file *File) (*CsvFile, error) { diff --git a/templates/pages/revisions.html b/templates/pages/revisions.html index 69a197b..5c30edf 100644 --- a/templates/pages/revisions.html +++ b/templates/pages/revisions.html @@ -35,7 +35,7 @@ {{ $file.Filename }}({{ $.locale.Tr "gist.revision.file-created" }}) {{ else if $file.IsDeleted }} {{ $file.Filename }} ({{ $.locale.Tr "gist.revision.file-deleted" }}) - {{ else if ne $file.OldFilename "" }} + {{ else if ne $file.OldFilename $file.Filename }} {{ $file.OldFilename }} {{ $.locale.Tr "gist.revision.file-renamed" }} {{ $file.Filename }} {{ else }} {{ $file.Filename }}