#!/usr/bin/env python3
"""
Emit a bash script that replays the page and media history of a Dokuwiki
instance as a sequence of git commits.

Nothing is executed by this module: every operation is printed as a shell
command so the resulting script can be reviewed before it is run.
"""

###############################################################################
from typing import List, Dict, Iterable, Callable, Generator, Optional
from operator import attrgetter

import os.path
import os
import collections
import itertools
import shlex


###############################################################################
import logging
logging.basicConfig(level=logging.WARNING)
log = logging.getLogger()


###############################################################################
def find_index(haystack: Iterable, needle, comparator: Callable) -> int:
    """
    Find the index of the first value within 'haystack' that compares as equal
    to 'needle' using the supplied 'comparator'

    :param haystack: The values to search, in order.
    :param needle: The value passed as the second argument of 'comparator'.
    :param comparator: Called as `comparator(candidate, needle)`; a truthy
        result is treated as a match.
    :return: The zero-based position of the first match.
    :raises ValueError: If no value matches.
    """
    for index, data in enumerate(haystack):
        if comparator(data, needle):
            return index
    raise ValueError("Needle not found")


###############################################################################
class User(object):
    """
    Represents a single user within the Dokuwiki namespace.

    We deliberately do not store the password hash.
    """
    login: str
    real_name: str
    email: str

    def __init__(self, login, real_name, email):
        self.login = login
        self.real_name = real_name
        self.email = email

    def author(self) -> str:
        """
        Return authorship information formatted for `git commit`.

        eg. 'A U Thor <author@example.com>'
        """
        return f"{self.real_name} <{self.email}>"


# -----------------------------------------------------------------------------
class Change(object):
    """
    The base class for all change records in the Dokuwiki instance.

    This class stores all the required data fields, but subclasses must
    provide some required functionality; eg, path lookups, and history storage,
    are different for Pages and Media. Subclasses are expected to supply
    `_attic` and `_update`.

    All text placed into generated commands is passed through `shlex.quote`
    because page names and edit summaries are written by wiki users and may
    contain quotes or shell metacharacters.
    """
    timestamp: str
    ip: str
    operation: str
    page: str
    user: Optional[str]   # None is used for edits without a login
    description: str

    # One character per operation: Create, Edit, minor edit, Delete, Revert.
    KNOWN_OPERATIONS = "CEeDR"

    def __init__(self, timestamp, ip, operation, page, user, description):
        # The length check stops '' or 'CE' slipping through what would
        # otherwise be a substring test.
        assert len(operation) == 1 and operation in Change.KNOWN_OPERATIONS, \
            f"Operation '{operation}' is not handle"

        self.timestamp = timestamp
        self.ip = ip
        self.operation = operation
        self.page = page
        self.user = user
        self.description = description

    def __str__(self):
        return f'{self.user}@{self.timestamp}:{self.page} "{self.description}"'

    def _delete(self, _src, _dst: str) -> Generator[str, None, None]:
        """
        Generates the required commands to delete a given page

        :param _src: The Dokuwiki instance the change belongs to
        :param _dst: The path to the root of the git repository.
        """
        localpath = self.page.replace(':', os.path.sep)
        yield f'git rm --quiet {shlex.quote(localpath + ".txt")}'

    def apply(self, src, dst: str) -> Generator[str, None, None]:
        """
        Generate a list of commands to enact this changeset from the Dokuwiki
        instance; eg, create, edit, or delete a page.

        The actual operation is not performed within this function. Instead,
        it's dispatched to (potentially overridden) member functions.

        Some basic configure time checks are performed; primarily for the
        existence of the required source files. If the source file is missing
        an error is logged and no commands are generated for this change.

        :param src: The Dokuwiki instance this change comes from
        :param dst: The path to the root of the git repository.
        :raises KeyError: If the author of the change is not in `src.users`.
        """
        log.info("Applying %s: '%s'", self.operation, self.description)

        # If we're not trying to delete a page then ensure we can see the
        # source material in the Dokuwiki attic
        if self.operation != 'D':
            # `_attic` may already be rooted (media); os.path.join discards
            # `src.root` when the second component is absolute.
            attic = os.path.join(src.root, self._attic(src))
            if not os.path.isfile(attic):
                log.error(f'Source file {attic} not present for {self}; ignoring')
                return

        # Find the function that will apply this update. Deliberately
        # construct this to throw if the operation isn't handled.
        func = {
            'C': self._update,
            'E': self._update,
            'e': self._update,
            'R': self._update,
            'D': self._delete,
        }[self.operation]

        on_failure = f'|| die {shlex.quote(self.timestamp)}'

        # Generate the subcommands using the update function, but protect
        # against failure.
        for cmd in func(src, dst):
            yield f'{cmd} {on_failure}'

        # Finalise the update with a commit
        yield " ".join([
            'git commit',
            '--quiet',
            '--allow-empty',
            '--allow-empty-message',
            '--author=' + shlex.quote(src.users[self.user].author()),
            '--date=' + shlex.quote(f'{self.timestamp} +0000'),
            '-m ' + shlex.quote(self.description),
            on_failure,
        ])


# -----------------------------------------------------------------------------
class PageChange(Change):
    """
    Represents one change to one page in the Dokuwiki instance.
    """

    def _attic(self, _src) -> str:
        """
        Find the path to the history data within the Dokuwiki directories.

        :param _src: The Dokuwiki instance this change belongs to.
        :return: The relative path from the root of Dokuwiki to the compressed
            data. This is always gzip compressed.
        """
        local = self.page.replace(':', os.path.sep)
        return os.path.join(
            'data',
            'attic',
            f'{local}.{self.timestamp}.txt.gz'
        )

    def _update(self, src, dst: str) -> Generator[str, None, None]:
        """
        Yield the commands required to unpack this change.

        It is assumed we don't need to remove old data and can just uncompress
        over the top of the old data.

        :param src: The Dokuwiki instance this change belongs to.
        :param dst: The absolute path to the root of the git repository
        :return: A generator of shell commands.
        """
        localpath = self.page.replace(':', os.path.sep)
        dstpath = os.path.join(dst, localpath)
        dstdir = os.path.dirname(dstpath)
        attic = os.path.join(src.root, self._attic(src))

        yield f'mkdir -p {shlex.quote(dstdir)}'
        yield f'gunzip -c {shlex.quote(attic)} > {shlex.quote(dstpath + ".txt")}'
        # Relative to the repository root, which the script has cd'd into.
        yield f'git add {shlex.quote(localpath + ".txt")}'


# -----------------------------------------------------------------------------
class MediaChange(Change):
    """
    Represents changes to one media file in a Dokuwiki instance.
    """

    def _attic(self, src):
        """
        Calculates the path to the source data required for this change.

        :param src: The Dokuwiki instance this change belongs to
        :return: The path to the data, prefixed with `src.root`.
        :raises ValueError: If this change's timestamp is not present in
            `src.media` for this file.
        """
        local = self.page.replace(':', os.path.sep)

        # If we're the most recent iteration of the file then we return a path
        # directly to the live media directory.
        #
        # Keep in mind that we search from the newest-to-oldest (due to the
        # reversed sorting in media list records).
        #
        # There are some situations where we get two edits with the same
        # timestamp; but we can't have additional entries in `media_attic` for
        # the same timestamp (and there doesn't appear to be an alternative
        # method to store these changes)
        #
        # We treat this as if they can be collapsed into one change for the
        # purposes of path lookups.
        changes = src.media[self.page]
        index = find_index(changes, self, lambda a, b: a.timestamp == b.timestamp)
        if index == 0:
            return os.path.join(src.root, 'data', 'media', local)

        # We're a historical entry, so we need to find the backup within
        # media_attic
        basename, extension = os.path.splitext(local)
        return os.path.join(
            src.root,
            'data',
            'media_attic',
            f'{basename}.{self.timestamp}{extension}'
        )

    def _delete(self, _src, _dst: str) -> Generator[str, None, None]:
        """
        Generates the required commands to delete a media file.

        Media files are stored under their own name, so unlike pages there is
        no '.txt' suffix to append.

        :param _src: The Dokuwiki instance the change belongs to
        :param _dst: The path to the root of the git repository.
        """
        localpath = self.page.replace(':', os.path.sep)
        yield f'git rm --quiet {shlex.quote(localpath)}'

    def _update(self, src, dst: str) -> Generator[str, None, None]:
        """
        Yields the commands required to unpack this one change.

        :param src: The Dokuwiki instance this change belongs to.
        :param dst: The absolute path to the root of the git repository
        :return: A generator of shell commands.
        """
        localpath = self.page.replace(':', os.path.sep)
        dstpath = os.path.join(dst, localpath)
        dstdir = os.path.dirname(dstpath)

        yield f'mkdir -p {shlex.quote(dstdir)}'
        # Copy onto the explicit file name: historical revisions carry the
        # timestamp in their source name, which must not leak into the repo.
        yield f'cp {shlex.quote(self._attic(src))} {shlex.quote(dstpath)}'
        # Relative to the repository root, which the script has cd'd into.
        yield f'git add {shlex.quote(localpath)}'


###############################################################################
class Dokuwiki:
    """
    Represents data for a single Dokuwiki instance.

    It is not safe to use this on an installation that may be receiving
    updates; but it should be safe to use on a live site without active
    editors. Either way it will never rewrite any data.
    """
    users: Dict[Optional[str], User]
    media: Dict[str, List[Change]]
    changes: List[Change]

    def _record_media(self, change):
        """Append 'change' to the per-file media history."""
        self.media[change.page].append(change)

    def __init__(self, root: str):
        """
        Load the users and the complete change history below 'root'.

        :param root: The root directory of the Dokuwiki installation.
        """
        # The generated script changes directory before using these paths, so
        # they must not be relative to the caller's working directory.
        self.root = os.path.abspath(root)
        self.users = self._find_users()
        self.media = collections.defaultdict(list)

        page_generator = self._find_meta(os.path.join(self.root, 'data', 'meta'), PageChange)
        media_generator = self._find_meta(os.path.join(self.root, 'data', 'media_meta'), MediaChange)

        self.changes = list(page_generator)
        for entry in media_generator:
            self._record_media(entry)
            self.changes.append(entry)

        # Oldest first for replay; newest first for the media lookups.
        self.changes.sort(key=attrgetter('timestamp'))
        for history in self.media.values():
            history.sort(key=attrgetter('timestamp'), reverse=True)

    @staticmethod
    def _parse_user(line: str) -> Optional[User]:
        """
        Parse one line of users.auth.php.

        :param line: A raw line of the form
            'login:password:real name:email:groups'.
        :return: The described User; or None for blank lines, comments, and
            lines with too few fields (the latter are logged).
        """
        line = line.rstrip()
        if not line or line[0] == '#':
            return None

        fields = line.split(':')
        if len(fields) < 5:
            log.warning("Skipping malformed line in users.auth.php")
            return None

        login, _password, real_name, email, _groups, *tail = fields
        if tail:
            log.warning("Found extra components when parsing users.auth.php")
        return User(login=login, real_name=real_name, email=email)

    def _find_users(self) -> Dict[Optional[str], User]:
        """
        Parses the users.auth.php file to discover all listed users.

        :return: A mapping of login to User objects. The key None maps to a
            placeholder 'system' user for changes without a login.
        """
        found = {}

        auth_path = os.path.join(self.root, 'conf', 'users.auth.php')
        with open(auth_path, 'r', encoding='utf-8') as auth_file:
            for line in auth_file:
                user = self._parse_user(line)
                if user is not None:
                    found[user.login] = user

        found[None] = User(login="system", real_name="system", email="system@mlug-au.org")
        return found

    def _find_meta(self, root: str, type):
        """
        Yield every change recorded in the '.changes' files below 'root'.

        :param root: The directory to search recursively.
        :param type: The Change subclass to instantiate for each record.
        """
        for dirpath, dirnames, filenames in os.walk(root):
            # Visit in a fixed order so repeated runs emit identical scripts.
            dirnames.sort()
            for file in sorted(filenames):
                # We don't really care about 'indexed' or 'meta' files.
                base, ext = os.path.splitext(file)
                if ext != '.changes':
                    continue

                # Some paths are autogenerated listings of content that aren't
                # relevant to us (and drastically complicate processing)
                if base in ('_media', '_dokuwiki'):
                    continue

                # Actually read the metadata
                path = os.path.join(dirpath, file)
                yield from self._read_meta(path, type)

    @staticmethod
    def _parse_change(entry: str, type):
        """
        Parse one tab separated line of a '.changes' file.

        :param entry: The raw line, with or without its line terminator.
        :param type: The Change subclass to instantiate.
        :return: An instance of 'type', or None if the line is blank.
        :raises ValueError: If the line has fewer than six fields.
        :raises RuntimeError: If the user is empty but the description is not
            'external edit'.
        """
        entry = entry.rstrip('\r\n')
        if not entry:
            return None

        timestamp, ip, operation, page, user, description, *tail = entry.split('\t')

        # Anything past the description is not carried into git.
        if any(tail):
            log.warning("Dropping meta change data")

        if not user:
            if description != "external edit":
                raise RuntimeError("Empty user doesn't correspond to 'external edit'")
            user = None

        return type(
            timestamp=timestamp,
            ip=ip,
            operation=operation,
            page=page,
            user=user,
            description=description
        )

    def _read_meta(self, path: str, type):
        """
        Yield one change per non-blank line of the '.changes' file at 'path'.

        :param path: The file to read.
        :param type: The Change subclass to instantiate for each record.
        """
        log.debug("extracting meta from %s", path)

        with open(path, 'r', encoding='utf-8') as data:
            for entry in data:
                change = self._parse_change(entry, type)
                if change is not None:
                    yield change


###############################################################################
# The first lines of the generated script; `die` is used by every command.
header = [
    '#!/usr/bin/env bash',
    'die() { echo "$*" 1>&2 ; exit 1; }',
]


def preamble(_src, dst: str):
    """
    Yield the commands that create the repository and enter it.

    :param _src: Unused.
    :param dst: The path to the root of the git repository.
    """
    quoted = shlex.quote(dst)
    yield f'git init {quoted} || die "git init failed"'
    # Every later command assumes we are inside the repository.
    yield f'cd {quoted} || die "cannot enter repository"'


def finish(_src, _dst: str):
    """
    Yield the command that records the final marker commit.

    :param _src: Unused.
    :param _dst: Unused.
    """
    arguments = [
        'git commit',
        '--quiet',
        '--allow-empty',
        # git needs 'Name <email>' here; anything else is treated as a
        # search pattern against existing commits.
        '--author="doku2git <doku2git@mlug-au.org>"',
        '-m "Converted dokuwiki to git"'
    ]
    yield " ".join(arguments)


def main():
    """Parse the command line and print the conversion script to stdout."""
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument("--src", required=True, help="The root directory of the dokuwiki source")
    parser.add_argument("--dst", required=True, help="The git output directory")
    parser.add_argument("--users", type=str, help="A JSON serialised list of supplementary users")

    args = parser.parse_args()

    src = Dokuwiki(root=args.src)
    # The script changes directory, so relative paths would stop resolving.
    dst = os.path.abspath(args.dst)

    if args.users:
        with open(args.users, 'r', encoding='utf-8') as users:
            for user in json.load(users):
                src.users[user['login']] = User(**user)

    todo = [
        header,
        preamble(src, dst),
        *(change.apply(src, dst) for change in src.changes),
        finish(src, dst)
    ]

    for cmd in itertools.chain.from_iterable(todo):
        print(cmd)


if __name__ == "__main__":
    main()