commit 6df757f719459ab9bfa818df7911bff30364deaf Author: Danny Robson Date: Sun Jul 7 13:57:02 2019 +1000 Initial import diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fcfee9e --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/users.json diff --git a/README.adoc b/README.adoc new file mode 100644 index 0000000..7f7c01d --- /dev/null +++ b/README.adoc @@ -0,0 +1,16 @@ +== doku2git + +=== About + +A small Python script that will generate a shall script that can convert an existing Dokuwiki site into a Git repository will (as close as possible to) full edit history. + +=== Usage + +Simple cases can be run as `./doku2git --src /foo/dokuwiki --dst /bar/git > script.sh` + +==== Arguments + +[horizontal] +src:: The absolute path to the Dokuwiki root +dst:: The absolute path to the Git repository +users:: A path to a JSON file containing an array of objects describing additional users. They must have the keys 'login', 'real_name', and 'email'. diff --git a/doku2git.py b/doku2git.py new file mode 100755 index 0000000..fca21d5 --- /dev/null +++ b/doku2git.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 + + +############################################################################### +from typing import List, Mapping, Iterable, Callable, Generator +from operator import attrgetter + +import os.path +import os +import collections +import itertools + + +############################################################################### +import logging + +logging.basicConfig(level = logging.WARNING) +log = logging.getLogger() + + +############################################################################### +def find_index(haystack: Iterable, needle, comparator: Callable) -> int: + """ + Find the index of the first value within 'haystack' that compares as equal + to 'needle' using the supplied 'comparator' + """ + for index, data in zip(itertools.count(), haystack): + if comparator(data, needle): + return index + + raise ValueError("Needle not found") + + +############################################################################### +class User(object): + """ + Represents a single user within the Dokuwiki namespace. + + We deliberately do not store the password hash. + """ + + login: str + real_name: str + email: str + + def __init__(self, login, real_name, email): + self.login = login + self.real_name = real_name + self.email = email + + def author(self) -> str: + """ + Return authorship information formatted for `git commit`. + eg. 'A U Thor ' + """ + return f"{self.real_name} <{self.email}>" + + +##----------------------------------------------------------------------------- +class Change(object): + """ + The base class for all change records in the Dokuwiki instance. + + This class stores all the required data fields, but subclasses must + provide some required functionality; eg, path lookups, and history + storage, are different for Pages and Media. + """ + + timestamp: str + ip: str + operation: str + page: str + user: str + description: str + + KNOWN_OPERATIONS = "CEeD" + + def __init__(self, timestamp, ip, operation, page, user, description): + assert operation in Change.KNOWN_OPERATIONS, f"Operation '{operation}' is not handle" + + self.timestamp = timestamp + self.ip = ip + self.operation = operation + self.page = page + self.user = user + self.description = description + + def __str__(self): + return f'{self.user}@{self.timestamp}:{self.page} "{self.description}"' + + def _delete(self, src: str, dst: str) -> Generator[str, None, None]: + """ + Generates the required commands to delete a given page + :param src: The Dokuwiki instance the change belongs to + :param dst: The path to the root of the git repository. + """ + localpath = self.page.replace(':', os.path.sep) + yield f'git rm --quiet "{localpath}.txt"' + + def apply(self, src, dst: str) -> Generator[str, None, None]: + """ + Generate a list of commands to enact this changeset from the Dokuwiki + instance; eg, create, edit, or delete a page. + + The actual operation is not performed within this function. Instead, it's + dispatched to (potentially overridden) member functions. + + Some basic configure time checks are performed; primarily for the + existence of the required source files. + + :param src: The Dokuwiki instance this change comes from + :param dst: The path to the root of the git repository. + """ + log.info(f"Applying {self.operation}: '{self.description}'") + + # If we're not trying to delete a page then ensure we can see the + # source material in the Dokuwiki attic + if self.operation not in ('D'): + attic = os.path.join(src.root, self._attic(src)) + if not os.path.isfile(attic): + log.error(f'Source file {attic} not present for {self}; ignoring') + return + + # Find the function that will apply this update. Deliberately + # construct this to throw if the operation isn't handled. + func = { + 'C': self._update, + 'E': self._update, + 'e': self._update, + 'D': self._delete + }[self.operation] + + # Generate the subcommands using the update function, but protect + # against failure. + for cmd in func(src, dst): + yield f'{cmd} || die "{self.timestamp}"' + + # Finalise the update with a commit + yield " ".join([ + 'git commit', + '--quiet', + '--allow-empty', + '--allow-empty-message', + f'--author="{src.users[self.user].author()}"', + f' --date="{self.timestamp} +0000"', + f'-m "{self.description}"', + f'|| die "{self.timestamp}"' + ]) + + +##----------------------------------------------------------------------------- +class PageChange(Change): + """ + Represents one change to one page in the Dokuwiki instance. + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _attic(self, src) -> str: + """ + Find the path to the history data within the Dokuwiki directories. + :param src: The Dokuwiki instance this change belongs to. + :return: The relative path from the root of Dokuwiki to the compressed + data. This is always gzip compressed. + """ + local = self.page.replace(':', os.path.sep) + + return os.path.join( + 'data', + 'attic', + f'{local}.{self.timestamp}.txt.gz' + ) + + def _update(self, src, dst: str) -> Generator[str, None, None]: + """ + Yield the commands required to unpack this change. + + It is assumed we don't need to remove old data and can just uncompress + over the top of the old data. + :param src: The Dokuwiki instance this change belongs to. + :param dst: The absolute path to the root of the git repository + :return: + """ + localpath = self.page.replace(':', os.path.sep) + + dstpath = os.path.join(dst, localpath) + dstdir = os.path.dirname(dstpath) + + attic = os.path.join(src.root, self._attic(src)) + + cmds = [ + f'mkdir -p "{dstdir}"', + f'gunzip -c "{attic}" > "{dstpath}.txt"', + f'git add "{localpath}".txt', + + ] + + for c in cmds: + yield c + + +##----------------------------------------------------------------------------- +class MediaChange(Change): + """ + Represents changes to one media file in a Dokuwiki instance. + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _attic(self, src): + """ + Calculates the path to the source data required for this change. + :param src: The Dokuwiki instance this change belongs to + :return: The absolute to the data + """ + local = self.page.replace(':', os.path.sep) + + # If we're the most recent iteration of the file then we return a path + # directly to the live media directory. + # + # Keep in mind that we search from the newest-to-oldest (due to the + # reversed sorting in media list records). + # + # There are some situations where we get two edits with the same + # timestamp; but we can't have additional entries in `media_attic` for + # the same timestamp (and there doesn't appear to be an alternative + # method to store these changes) + # + # We treat this as if they can be collapsed into one change for the + # purposes of path lookups. + changes = src.media[self.page] + index = find_index(changes, self, lambda a, b: a.timestamp == b.timestamp) + if index == 0: + return os.path.join(src.root, 'data', 'media', local) + + # We're a historical entry, so we need to find the backup within + # media_attic + basename, extension = os.path.splitext(local) + return os.path.join( + src.root, + 'data', + 'media_attic', + f'{basename}.{self.timestamp}{extension}' + ) + + def _update(self, src: str, dst: str) -> Generator[str, None, None]: + """ + Yields the commands required to unpack this one change. + :param src: The Dokuwiki instance this change belongs to. + :param dst: The absolute path to the root of the git repository + :return: + """ + localpath = self.page.replace(':', os.path.sep) + localdir = os.path.dirname(localpath) + + yield f'cp "{self._attic(src)}" "{os.path.join(dst, localdir)}"' + + +############################################################################### +class Dokuwiki: + """ + Represents data for a single Dokuwiki instance. + + It is not safe to use this on an installation that may be receiving + updates; but it should be safe to use on a live site without active + editors. Either way it will never rewrite any data. + """ + + users: Mapping[str, User] + media: Mapping[str, List[Change]] + changes: List[Change] + + def _record_media(self, change): + self.media[change.page].append(change) + + def __init__(self, root: str): + self.root = root + self.users = self._find_users() + self.media = collections.defaultdict(list) + + page_generator = self._find_meta(os.path.join (self.root, 'data', 'meta'), PageChange) + media_generator = self._find_meta(os.path.join (self.root, 'data', 'media_meta'), MediaChange) + + self.changes = list(page_generator) + for entry in media_generator: + self._record_media(entry) + self.changes.append(entry) + + self.changes.sort(key=attrgetter('timestamp')) + for k,v in self.media.items(): + v.sort(key=attrgetter('timestamp'), reverse=True) + + def _find_users(self) -> Mapping[str, User]: + """ + Parses the users.auth.php file to discover all listed users. + :return: A mapping of login to User objects + """ + found = {} + + auth_path = os.path.join(self.root, 'conf', 'users.auth.php') + with open(auth_path, 'r') as auth_file: + for line in auth_file: + # Make sure the line actually contains some data + line = line.rstrip() + if len(line) == 0: + continue + if line[0] == '#': + continue + + login, password, real_name, email, groups, *tail = line.split(':') + if len(tail) > 0: + log.warning("Found extra components when parsing users.auth.php") + + found[login] = User(login=login, real_name=real_name, email=email) + + found[None] = User(login="system", real_name="system", email="system@mlug-au.org") + return found + + + def _find_meta(self, root: str, type): + for dirpath, dirnames, filenames in os.walk(root): + for file in filenames: + # We don't really care about 'indexed' or 'meta' files. + base, ext = os.path.splitext(file) + if ext != '.changes': + continue + + # Some paths are autogenerated listings of content that aren't + # relevant to us (and drastically complicate processing) + if base in ('_media', '_dokuwiki'): + continue + + # Actually read the metadata + path = os.path.join(dirpath, file) + for change in self._read_meta(path, type): + yield change + + + def _read_meta(self, path: str, type): + log.debug("extracting meta from %s", path) + with open(path, 'r') as data: + for entry in data: + timestamp, ip, operation, page, user, description, *tail = entry.split('\t') + + if len(tail) > 1: + log.warning("Dropping meta change data") + else: + assert tail[0] == '\n' + + if not user: + if description != "external edit": + raise RuntimeError("Empty user doesn't correspond to 'external edit'") + user = None + + yield type( + timestamp=timestamp, + ip=ip, + operation=operation, + page=page, + user=user, + description=description + ) + + +############################################################################### +if __name__ == "__main__": + import argparse + import json + + header = [ + '#!/usr/bin/env bash', + 'die() { echo "$*" 1>&2 ; exit 1; }', + ] + + def preamble(src, dst: str): + cmds = [ + f'git init "{dst}"', + f'cd "{dst}"', + ] + + for c in cmds: + yield c + + def finish(src, dst: str): + arguments = [ + f'git commit', + ' --quiet', + '--allow-empty', + '--author="doku2git@mlug-au.org "', + '-m "Converted dokuwiki to git"' + ] + yield " ".join(arguments) + + def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--src", required=True, help="The root directory of the dokuwiki source") + parser.add_argument("--dst", required=True, help="The git output directory") + parser.add_argument("--users", type=str, help="A JSON serialised list of supplementary users") + + args = parser.parse_args() + + src = Dokuwiki(root=args.src) + + if args.users: + with open(args.users, 'r') as users: + for user in json.load(users): + src.users[user['login']] = User(**user) + + todo = [ + header, + preamble(src, args.dst), + *(change.apply(src, args.dst) for change in src.changes), + finish(src, args.dst) + ] + + for cmd in itertools.chain(*todo): + print(cmd) + + main()