#!/usr/bin/env python3
#
# git-restore-mtime - Change mtime of files based on commit date of last change
#
#    Copyright (C) 2012 Rodrigo Silva (MestreLion) <linux@rodrigosilva.com>
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program. See <http://www.gnu.org/licenses/gpl.html>
#
"""
Change the modification time (mtime) of all files in work tree, based on the
date of the most recent commit that modified the file.

Useful prior to generating release tarballs, so each file is archived with a
date that is similar to the date when the file was actually last modified,
assuming the actual modification date and its commit date are close.

By default ignores all ignored and untracked files, and also refuses to work
on trees with uncommitted changes.
"""

# TODO:
# - Gracefully abort on KeyboardInterrupt (CTRL+C), with sys.exit(1). Require all code
#   to be wrapped in main() function
# - Add -z on git whatchanged/ls-files so we don't deal with filename decode/OS normalization
# - When Python is bumped to 3.7, use text instead of universal_newlines on subprocess
# - Update "Statistics for some large projects" with modern hardware and repositories.
# - Create a README.md for git-restore-mtime alone. It deserves extensive documentation
#   - Move Statistics there

# FIXME:
# - Given lsfileslist, why do we still need filelist/os.walk routine? Should be changed to:
#   - Process files on lsfileslist. It already discarded paths outside repo and '.git/'
#   repo dir, and any missing files on disk makes repo dirty, already checked above.
#   - dirlist should also be extracted from lsfileslist.
#   - Discard symlinks on unsupported platforms
# - When current dir is outside the worktree, e.g. using --work-tree, `git ls-files`
#   assumes any relative pathspecs are relative to the worktree root, while the filelist
#   routine always assumes they are relative to the current directory. As is, absolute
#   paths are required; it's impossible to match pathspecs using relative paths
# - Renames and mode changes should not change file mtime:
#   - Must check on status 'R100' and mode changes with same blobs
#   - Should require status to be (A, C, M, R<100, T). D will never be processed as
#     filelist is a subset of lsfileslist.
# - Check file (A, D) for directory mtime is not sufficient:
#   - Renames also change dir mtime, unless rename was on a parent dir
#   - If most recent change of all files in a dir was a [M]odification,
#     dir might not be touched at all.
#   - Dirs containing only subdirectories but no direct files will also
#     not be touched. They're files' [grand]parent dir, but never their dirname().
#   - Some solutions:
#     - After files done, perform some dir processing for missing dirs, finding latest
#       file (A, D, R)
#     - Simple approach: dir mtime is most recent child (dir or file) mtime
#     - Use a virtual concept of "created at most at" to fill missing info, bubble up
#       to parents and grandparents
#   - When handling [grand]parent dirs, stay inside <pathspec>
# - Better handling of merge commits. `-m` is plain *wrong*. `-c/--cc` is perfect, but
#   painfully slow. First pass without merge commits is not accurate. Maybe add a new
#   `--accurate` mode for `--cc`?

# This file is a command-line script whose main logic runs at module level, so
# importing it would execute the whole tool. Refuse to be imported instead.
if __name__ != "__main__":
    raise ImportError("{} should not be used as a module.".format(__name__))

import subprocess, shlex
import sys, os.path
import logging
import argparse
import time


# A symlink's own mtime can only be set when os.utime() accepts follow_symlinks=False.
# os.supports_follow_symlinks, where it exists, is the collection of functions that do.
UPDATE_SYMLINKS = os.utime in getattr(os, 'supports_follow_symlinks', ())


# Command-line interface ######################################################

def parse_args(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: optional list of argument strings to parse. When None (the default),
            argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``loglevel``, ``verbose``, ``force``,
        ``merge``, ``first_parent``, ``missing``, ``dirs``, ``test``, ``commit_time``,
        ``pathspec``, ``workdir`` and ``gitdir``.
    """
    parser = argparse.ArgumentParser(
        description="""Restore original modification time of files based on the date of the
        most recent commit that modified them. Useful when generating release tarballs.""")

    # --quiet and --verbose cannot be combined
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--quiet', '-q', dest='loglevel',
        action="store_const", const=logging.WARNING, default=logging.INFO,
        help="Suppress informative messages and summary statistics.")
    group.add_argument('--verbose', '-v', action="count",
        help="Print additional information for each processed file.")

    parser.add_argument('--force', '-f', action="store_true",
        help="Force execution on trees with uncommitted changes.")

    parser.add_argument('--merge', '-m', action="store_true",
        help="""Include merge commits. Leads to more recent mtimes and more files per
        commit, thus with the same mtime (which may or may not be what you want). Including
        merge commits may lead to less commits being evaluated (all files are found sooner),
        which improves performance, sometimes substantially. But since merge commits are
        usually huge, processing them may also take longer, sometimes substantially.
        By default merge logs are only used for files missing from regular commit logs.""")

    parser.add_argument('--first-parent', action="store_true",
        help="""Consider only the first parent, the "main branch", when parsing merge
        commit logs. Only effective when merge commits are included in the log, either
        by --merge or to find missing files after first log parse. See --skip-missing.""")

    # Stored inverted: args.missing is True unless --skip-missing is given
    parser.add_argument('--skip-missing', '-s',
        action="store_false", default=True, dest="missing",
        help="""Do not try to find missing files. If some files were not found in regular
        commit logs, by default it re-tries using merge commit logs for these files (if
        --merge was not already used). This option disables this behavior, which may slightly
        improve performance, but files found only in merge commits will not be updated.""")

    # Stored inverted: args.dirs is True unless --no-directories is given
    parser.add_argument('--no-directories', '-D',
        action="store_false", default=True, dest='dirs',
        help="""Do not update directory mtime for files created, renamed or deleted in it.
        Note: just modifying a file will not update its directory mtime.""")

    parser.add_argument('--test', '-t', action="store_true", default=False,
        help="Test run: do not actually update any file")

    parser.add_argument('--commit-time', '-c',
        action='store_true', default=False, dest='commit_time',
        help="Use commit time instead of author time")

    parser.add_argument('pathspec', nargs='*', metavar='PATH',
        help="""Only modify paths matching PATH, directories or files, relative to current
        directory. Default is to modify all files handled by git, ignoring untracked files
        and submodules.""")

    parser.add_argument('--work-tree', dest='workdir',
        help="Path to the work tree, if not current directory or one of its parents.")

    parser.add_argument('--git-dir', dest='gitdir',
        help="Path to the git repository, if not the default <work-tree-root>/.git")

    return parser.parse_args(argv)


# Helper functions #########################################################

def setup_logging(args):
    """Configure logging from the parsed arguments and return the root logger.

    Adds a ``trace()`` method to ``logging.Logger`` that logs at half the numeric
    value of DEBUG. With ``--verbose`` the threshold is DEBUG divided by the number
    of times the flag was given, never going below the trace level; otherwise the
    threshold is ``args.loglevel``.
    """
    trace_level = logging.DEBUG // 2

    def trace(logger, msg, *fmt_args, **options):
        return logger.log(trace_level, msg, *fmt_args, **options)

    logging.Logger.trace = trace

    if args.verbose:
        threshold = max(trace_level, logging.DEBUG // args.verbose)
    else:
        threshold = args.loglevel

    logging.basicConfig(level=threshold, format='%(message)s')
    return logging.getLogger()


def normalize(path):
    r"""Normalize paths from git, handling non-ASCII characters.

    Git for Windows, as of v1.7.10, stores paths as UTF-8 normalization form C. If path
    contains non-ASCII or non-printable chars it outputs the UTF-8 in octal-escaped
    notation, double-quoting the whole path. Double-quotes and backslashes are also escaped.

    https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath
    https://github.com/msysgit/msysgit/wiki/Git-for-Windows-Unicode-Support
    https://github.com/git/git/blob/master/Documentation/i18n.txt

    Example on git output, this function reverts this:
    r'back\slash_double"quote_açaí' -> r'"back\\slash_double\"quote_a\303\247a\303\255"'

    Bytes that do not form valid UTF-8 are kept as lone surrogates ('surrogateescape'
    error handler) instead of raising UnicodeDecodeError.

    Args:
        path: a path as printed by git, possibly double-quoted and escaped.

    Returns:
        The unquoted path, passed through os.path.normpath().
    """
    if path and path[0] == '"':
        # Python 2: path = path[1:-1].decode("string-escape")
        # Python 3: https://stackoverflow.com/a/46650050/624066
        path = (path[1:-1]                 # Remove enclosing double quotes
                .encode('latin1')          # Convert to bytes, required 'unicode-escape'
                .decode('unicode-escape')  # Perform the actual octal-escaping decode
                .encode('latin1')          # 1:1 mapping to bytes, forming UTF-8 encoding
                .decode('utf8', 'surrogateescape'))  # Decode from UTF-8, keep bad bytes
    # Make sure the slash matches the OS; for Windows we need a backslash
    return os.path.normpath(path)


def touch(path, mtime, test=False):
    """Set both access and modification time of *path* to *mtime*.

    Does nothing when *test* is true. When UPDATE_SYMLINKS is set, a symlink's own
    timestamps are changed (follow_symlinks=False) rather than its target's.
    """
    if test:
        return
    stamps = (mtime, mtime)
    if UPDATE_SYMLINKS:
        os.utime(path, stamps, follow_symlinks=False)
    else:
        os.utime(path, stamps)


def isodate(secs):
    """Format *secs* (seconds since the epoch) as local time, 'YYYY-MM-DD HH:MM:SS'."""
    local = time.localtime(secs)
    return time.strftime('%Y-%m-%d %H:%M:%S', local)


class Git:
    """Minimal wrapper around the ``git`` command line used by this script.

    Every command is run as ``git [--work-tree W] [--git-dir G] <subcommand> ...``.
    """

    def __init__(self, workdir=None, gitdir=None):
        """Prepare the base command, adding --work-tree / --git-dir when given."""
        self.gitcmd = ['git']
        # Popen object of the most recent streamed command (see _run); None until then
        self.proc = None
        if workdir: self.gitcmd.extend(('--work-tree', workdir))
        if gitdir : self.gitcmd.extend(('--git-dir',   gitdir))

    def ls_files(self, pathlist=None):
        """Lazily yield the normalized paths printed by `git ls-files --full-name`."""
        return (normalize(_) for _ in self._run('ls-files --full-name', pathlist))

    def is_dirty(self):
        """Return True when `git diff --quiet` exits with a non-zero status."""
        return bool(self._run('diff --no-ext-diff --quiet', output=False))

    def repodirs(self):
        """Yield the work tree top level and the absolute git dir, both normalized.

        Raises:
            Git.Error: if git exits with a non-zero status.
        """
        return (os.path.normpath(_) for _ in
                self._run('rev-parse --show-toplevel --absolute-git-dir', check=True))

    def log(self, merge=False, first_parent=False, commit_time=False, pathlist=None):
        """Stream `git whatchanged` output: one timestamp line per commit, then its files.

        The timestamp is the committer date (%ct) when *commit_time* is true, otherwise
        the author date (%at). *merge* adds -m and *first_parent* adds --first-parent.

        NOTE(review): `git whatchanged` appears to be deprecated in recent Git releases;
        confirm against the Git versions this script must support.
        """
        cmd = 'whatchanged --pretty={}'.format('%ct' if commit_time else '%at')
        if merge:        cmd += ' -m'
        if first_parent: cmd += ' --first-parent'
        return self._run(cmd, pathlist)

    def _run(self, cmdstr, pathlist=None, output=True, check=False):
        """Run a git subcommand.

        Args:
            cmdstr: subcommand and options as a single string, split with shlex.
            pathlist: optional paths, appended after a '--' separator.
            output: when false, only the exit status is returned.
            check: when true, the command runs to completion and a non-zero exit
                status raises Git.Error.

        Returns:
            The exit status (output=False), a list of output lines (check=True), or
            a generator of stripped output lines, streamed while the command runs.
            In the streaming case the process is available as ``self.proc``.
        """
        cmdlist = self.gitcmd + shlex.split(cmdstr)
        if pathlist:
            cmdlist.append('--')
            cmdlist.extend(pathlist)
        log.trace("Executing: %s", ' '.join(cmdlist))
        if not output:
            return subprocess.call(cmdlist)
        if check:
            try:
                stdout = subprocess.check_output(cmdlist, universal_newlines=True)
                return stdout.splitlines()
            except subprocess.CalledProcessError as e:
                raise self.Error(e.returncode, e.cmd, e.output, e.stderr)
        self.proc = subprocess.Popen(cmdlist, stdout=subprocess.PIPE, universal_newlines=True)
        return self._stream(self.proc)

    @staticmethod
    def _stream(proc):
        """Yield stripped stdout lines of *proc*, releasing it when done.

        The pipe is closed and the child reaped once the output is exhausted or the
        generator is closed early, so no descriptor or unreaped process is left behind.
        """
        try:
            for line in proc.stdout:
                yield line.strip()
        finally:
            # Close first, so a child that is still writing cannot block on a full pipe
            proc.stdout.close()
            proc.wait()

    class Error(subprocess.CalledProcessError):
        """Raised by _run(check=True) when git exits with a non-zero status."""


# Main Logic ##################################################################

args = parse_args()
log = setup_logging(args)
log.trace("Arguments: %s", args)

# Command line handled; start the clock. Wall time on purpose: CPU time would not
# match what the user actually waits for.
start = time.time()

# Running totals, updated while parsing the log and reported in the final statistics
loglines = commits = 0
touches = errors = 0
dirtouches = direrrors = 0

# How many paths are passed per git call when retrying files missing from the log
stepmissing = 100

git = Git(args.workdir, args.gitdir)

# Locate the work tree root and the repository directory
try:
    workdir, gitdir = git.repodirs()
except git.Error as e:
    # Not in a git repository. git already told the user on stderr, so just
    # propagate its exit status.
    sys.exit(e.returncode)

# A dirty work tree is only processed when --force was given
if not args.force:
    if git.is_dirty():
        log.critical(
         "ERROR: There are local changes in the working directory.\n"
         "This could lead to undesirable results for modified files.\n"
         "Please, commit your changes (or use --force) and try again.\n"
         "Aborting")
        sys.exit(1)

# Paths managed by git, restricted to the user's pathspecs if any
lsfileslist = set(git.ls_files(args.pathspec))

# List files matching user pathspec, relative to current directory
# git commands always print paths relative to work tree root
filelist = set()
dirlist  = set()
for path in (args.pathspec or (args.workdir or '.',)):

    # Normalize user input so ./doc = doc/ = doc/../doc/. = doc
    path = os.path.normpath(path)

    # Is path inside the work tree? Compare whole path components: a character-wise
    # prefix test would wrongly accept a sibling such as '<workdir>-old'.
    try:
        inside = os.path.commonpath([workdir, os.path.abspath(path)]) == workdir
    except ValueError:
        # The paths share no common ancestor, e.g. different drives on Windows
        inside = False
    if not inside:
        log.warning("WARNING: Skipping pathspec outside work tree: %s", path)
        continue

    # git does not care if it's a broken symlink, hence lexists
    if not os.path.lexists(path):
        log.warning("WARNING: Skipping non-existing pathspec: %s", path)
        continue

    # file or symlink (to file, to dir or broken - git handles the same way)
    islink = os.path.islink(path)
    if os.path.isfile(path) or islink:
        if islink and not UPDATE_SYMLINKS:
            log.warning("WARNING: Skipping symlink, OS does not support update: %s", path)
            continue
        # Always add them relative to worktree root
        filelist.add(os.path.relpath(path, workdir))

    # dir
    else:
        for root, subdirs, filenames in os.walk(path):
            # Do not descend into the repository directory itself
            if gitdir in [os.path.abspath(os.path.join(root, subdir))
                          for subdir in subdirs]:
                subdirs.remove(os.path.basename(gitdir))

            # At the work tree root '.git' may be a regular file; never process it
            if os.path.abspath(root) == workdir and '.git' in filenames:
                filenames.remove('.git')

            if args.dirs:
                dirname = os.path.relpath(root, workdir)
                if dirname == '.':
                    dirname = ''  # Like git does
                dirlist.add(dirname)

            for filename in filenames:
                # Always add them relative to worktree root
                filelist.add(os.path.relpath(os.path.join(root, filename), workdir))

# Keep only the files that git actually manages
filelist &= lsfileslist

# 'files' counts the files still waiting for an mtime; 'totalfiles' stays fixed
totalfiles = files = len(filelist)
log.info("{0:,} files to be processed in work dir".format(totalfiles))

if not filelist:
    # Nothing to do. Exit silently and without errors, just like git does
    sys.exit(0)

# Process the log until all files are 'touched'
log.debug("Line #\tLog #\tF.Left\tModification Time\tFile Name")
def parselog(args, merge=False, filterlist=None):
    """Walk the git log, newest commit first, touching each listed file once.

    A line starting with ':' describes a changed file; any other non-blank line is
    the timestamp of the commit the following file lines belong to. The first time
    a path from ``filelist`` shows up it receives that timestamp and is removed
    from the set. With ``args.dirs``, a directory in ``dirlist`` is touched the
    first time a file directly inside it is seen as added or deleted.

    Updates the module-level counters, and stops early (terminating git) as soon as
    no files are left.

    Args:
        args: parsed command-line arguments.
        merge: whether merge commits are included in the log.
        filterlist: optional paths to restrict the log to.
    """
    global loglines, commits, touches, errors, dirtouches, direrrors, files

    mtime = 0
    history = git.log(merge, args.first_parent, args.commit_time, filterlist)
    for line in history:
        loglines += 1

        # Blank separator between the timestamp and the file list
        if not line:
            continue

        if line[0] != ':':
            # Timestamp line: a new commit starts here
            commits += 1
            mtime = int(line)
        else:
            # File line. Tab-separated: two fields, or three when it is a rename;
            # the path of interest is always the last one.
            fields = line.split('\t')
            status = fields[0]
            file = normalize(fields[-1])  # non-ASCII chars and OS path separator

            if file in filelist:
                log.debug("%d\t%d\t%d\t%s\t%s",
                             loglines, commits, files,
                             isodate(mtime), file)
                filelist.remove(file)
                files -= 1
                try:
                    touch(os.path.join(workdir, file), mtime, args.test)
                except Exception as e:
                    log.error("ERROR: %s", e)
                    errors += 1
                else:
                    touches += 1

            if args.dirs and status.endswith(('A', 'D')):
                dirname = os.path.dirname(file)
                if dirname in dirlist:
                    log.debug("%d\t%d\t-\t%s\t%s",
                                 loglines, commits,
                                 isodate(mtime), "{}/".format(dirname or '.'))
                    dirlist.remove(dirname)
                    try:
                        touch(os.path.join(workdir, dirname), mtime, args.test)
                    except Exception as e:
                        log.error("ERROR: %s", e)
                        direrrors += 1
                    else:
                        dirtouches += 1

        # Every file handled: no point in reading the rest of the log
        if not files:
            git.proc.terminate()  # hackish, but does the job. Not needed anyway
            return


# First pass over the log, with merge commits only if the user asked for them
parselog(args, args.merge, args.pathspec)

# Some files never showed up in the log
if filelist:

    # Retry them against merge logs, unless merges were already included or the
    # user opted out (merge logs are usually HUGE, thus MUCH slower!)
    if not args.merge and args.missing:
        pending = list(filelist)
        for offset in range(0, len(pending), stepmissing):
            chunk = pending[offset:offset + stepmissing]
            parselog(args, merge=True, filterlist=chunk)

    # Whatever is still left could not be found anywhere
    for leftover in filelist:
        log.warning("WARNING: not found in log: %s", leftover)


# Final statistics
# Suggestion: use git-log --before=mtime to brag about skipped log entries
elapsed = time.time() - start
log.info(
    "Statistics:\n"
    "{:13,.2f} seconds\n"
    "{:13,} log lines processed\n"
    "{:13,} commits evaluated"
    "".format(elapsed, loglines, commits))

if args.dirs:
    if direrrors:
        log.info("{:13,} directory update errors".format(direrrors))
    log.info("{:13,} directories updated".format(dirtouches))

# The total is only worth a line when not every file was updated
if touches != totalfiles:
    log.info("{:13,} files".format(totalfiles))
if files:
    log.info("{:13,} files missing".format(files))
if errors:
    log.info("{:13,} file update errors".format(errors))

log.info("{:13,} files updated".format(touches))

if args.test:
    log.info("TEST RUN - No files modified!")
