#!/usr/bin/env python3
###############################################################################
from typing import List, Dict, Iterable, Callable, Generator
from operator import attrgetter
import abc
import os.path
import os
import collections
import itertools
###############################################################################
import logging
logging.basicConfig(level=logging.WARNING)
log = logging.getLogger()
###############################################################################
def find_index(haystack: Iterable, needle, comparator: Callable) -> int:
    """
    Find the index of the first value within 'haystack' that compares as equal
    to 'needle' using the supplied 'comparator'.
    """
    for index, data in zip(itertools.count(), haystack):
        if comparator(data, needle):
            return index
    raise ValueError("Needle not found")
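# Illustrative usage (hypothetical values): find_index([3, 5, 7], 5, operator.eq)
# returns 1; a needle that never satisfies the comparator raises ValueError.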
###############################################################################
class User(object):
    """
    Represents a single user within the Dokuwiki namespace.
    We deliberately do not store the password hash.
    """
    login: str
    real_name: str
    email: str

    def __init__(self, login, real_name, email):
        self.login = login
        self.real_name = real_name
        self.email = email

    def author(self) -> str:
        """
        Return authorship information formatted for `git commit`.
        e.g. 'A U Thor <author@example.com>'
        """
        return f"{self.real_name} <{self.email}>"
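# For example (hypothetical user record), User("jdoe", "Jane Doe", "jdoe@example.org").author()
# yields 'Jane Doe <jdoe@example.org>', which is the form `git commit --author=` expects.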
# -----------------------------------------------------------------------------
class Change(abc.ABC):
    """
    The base class for all change records in the Dokuwiki instance.
    This class stores all the required data fields, but subclasses must
    provide some required functionality; e.g. path lookups and history
    storage are different for Pages and Media.
    """
    timestamp: str
    ip: str
    operation: str
    page: str
    user: str
    description: str

    # Dokuwiki changelog operation codes: C = create, E = edit, e = minor edit,
    # D = delete.
    KNOWN_OPERATIONS = "CEeD"

    def __init__(self, timestamp, ip, operation, page, user, description):
        assert operation in Change.KNOWN_OPERATIONS, f"Operation '{operation}' is not handled"
        self.timestamp = timestamp
        self.ip = ip
        self.operation = operation
        self.page = page
        self.user = user
        self.description = description

    def __str__(self):
        return f'{self.user}@{self.timestamp}:{self.page} "{self.description}"'
    @abc.abstractmethod
    def _update(self, _src, _dst: str) -> Generator[str, None, None]:
        """
        Yield the commands required to unpack this change.
        :param _src: The Dokuwiki instance this change belongs to.
        :param _dst: The absolute path to the root of the git repository
        :return:
        """
        pass

    @abc.abstractmethod
    def _attic(self, _src) -> str:
        """
        Find the path to the history data within the Dokuwiki directories.
        :param _src: The Dokuwiki instance this change belongs to.
        :return: The relative path from the root of Dokuwiki to the data.
        """
        pass

    def _delete(self, _src: str, _dst: str) -> Generator[str, None, None]:
        """
        Generates the required commands to delete a given page.
        :param _src: The Dokuwiki instance the change belongs to
        :param _dst: The path to the root of the git repository.
        """
        localpath = self.page.replace(':', os.path.sep)
        yield f'git rm --quiet "{localpath}.txt"'
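    # e.g. deleting the page "wiki:syntax" emits: git rm --quiet "wiki/syntax.txt"
    # (page ids use ':' as the namespace separator; on disk that maps to directories).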

    def apply(self, src, dst: str) -> Generator[str, None, None]:
        """
        Generate a list of commands to enact this changeset from the Dokuwiki
        instance; e.g. create, edit, or delete a page.
        The actual operation is not performed within this function. Instead, it's
        dispatched to (potentially overridden) member functions.
        Some basic generation-time checks are performed; primarily for the
        existence of the required source files.
        :param src: The Dokuwiki instance this change comes from
        :param dst: The path to the root of the git repository.
        """
        log.info(f"Applying {self.operation}: '{self.description}'")

        # If we're not trying to delete a page then ensure we can see the
        # source material in the Dokuwiki attic
        if self.operation not in 'D':
            attic = os.path.join(src.root, self._attic(src))
            if not os.path.isfile(attic):
                log.error(f'Source file {attic} not present for {self}; ignoring')
                return

        # Find the function that will apply this update. Deliberately
        # construct this to throw if the operation isn't handled.
        func = {
            'C': self._update,
            'E': self._update,
            'e': self._update,
            'D': self._delete
        }[self.operation]

        # Generate the subcommands using the update function, but protect
        # against failure.
        for cmd in func(src, dst):
            yield f'{cmd} || die "{self.timestamp}"'

        # Finalise the update with a commit
        yield " ".join([
            'git commit',
            '--quiet',
            '--allow-empty',
            '--allow-empty-message',
            f'--author="{src.users[self.user].author()}"',
            f'--date="{self.timestamp} +0000"',
            f'-m "{self.description}"',
            f'|| die "{self.timestamp}"'
        ])
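    # For a page edit ('E') this expands to something like (illustrative):
    #   gunzip -c ".../data/attic/wiki/syntax.<ts>.txt.gz" > ".../wiki/syntax.txt" || die "<ts>"
    #   git add "wiki/syntax.txt" || die "<ts>"
    #   git commit --quiet ... --author="..." --date="<ts> +0000" -m "..." || die "<ts>"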
# -----------------------------------------------------------------------------
class PageChange(Change):
    """
    Represents one change to one page in the Dokuwiki instance.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _attic(self, _src) -> str:
        """
        Find the path to the history data within the Dokuwiki directories.
        :param _src: The Dokuwiki instance this change belongs to.
        :return: The relative path from the root of Dokuwiki to the compressed
            data. This is always gzip compressed.
        """
        local = self.page.replace(':', os.path.sep)
        return os.path.join(
            'data',
            'attic',
            f'{local}.{self.timestamp}.txt.gz'
        )
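    # e.g. the page "wiki:syntax" edited at timestamp 1361901072 is read from
    # data/attic/wiki/syntax.1361901072.txt.gz (illustrative timestamp).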

    def _update(self, src, dst: str) -> Generator[str, None, None]:
        """
        Yield the commands required to unpack this change.
        It is assumed we don't need to remove old data and can just uncompress
        over the top of the old data.
        :param src: The Dokuwiki instance this change belongs to.
        :param dst: The absolute path to the root of the git repository
        :return:
        """
        localpath = self.page.replace(':', os.path.sep)
        dstpath = os.path.join(dst, localpath)
        dstdir = os.path.dirname(dstpath)
        attic = os.path.join(src.root, self._attic(src))
        cmds = [
            f'mkdir -p "{dstdir}"',
            f'gunzip -c "{attic}" > "{dstpath}.txt"',
            f'git add "{localpath}.txt"',
        ]
        for c in cmds:
            yield c
# -----------------------------------------------------------------------------
class MediaChange(Change):
    """
    Represents changes to one media file in a Dokuwiki instance.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _attic(self, src):
        """
        Calculates the path to the source data required for this change.
        :param src: The Dokuwiki instance this change belongs to
        :return: The absolute path to the data
        """
        local = self.page.replace(':', os.path.sep)

        # If we're the most recent iteration of the file then we return a path
        # directly to the live media directory.
        #
        # Keep in mind that we search from newest-to-oldest (due to the
        # reversed sorting of the media change lists).
        #
        # There are some situations where we get two edits with the same
        # timestamp; but we can't have additional entries in `media_attic` for
        # the same timestamp (and there doesn't appear to be an alternative
        # method to store these changes).
        #
        # We treat this as if they can be collapsed into one change for the
        # purposes of path lookups.
        changes = src.media[self.page]
        index = find_index(changes, self, lambda a, b: a.timestamp == b.timestamp)
        if index == 0:
            return os.path.join(src.root, 'data', 'media', local)

        # We're a historical entry, so we need to find the backup within
        # media_attic
        basename, extension = os.path.splitext(local)
        return os.path.join(
            src.root,
            'data',
            'media_attic',
            f'{basename}.{self.timestamp}{extension}'
        )
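    # e.g. the latest revision of "wiki:logo.png" is copied from data/media/wiki/logo.png,
    # while an older revision dated 1361901072 comes from
    # data/media_attic/wiki/logo.1361901072.png (illustrative names).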

    def _update(self, src, dst: str) -> Generator[str, None, None]:
        """
        Yields the commands required to unpack this one change.
        :param src: The Dokuwiki instance this change belongs to.
        :param dst: The absolute path to the root of the git repository
        :return:
        """
        localpath = self.page.replace(':', os.path.sep)
        localdir = os.path.dirname(localpath)
        dstpath = os.path.join(dst, localdir)
        cmds = [
            # Ensure the namespace directory exists before copying into it;
            # it may not have been created yet by an earlier page change.
            f'mkdir -p "{dstpath}"',
            f'cp "{self._attic(src)}" "{dstpath}"',
            f'git add "{dstpath}"'
        ]
        for c in cmds:
            yield c
###############################################################################
class Dokuwiki:
    """
    Represents data for a single Dokuwiki instance.
    It is not safe to use this on an installation that may be receiving
    updates; but it should be safe to use on a live site without active
    editors. Either way it will never rewrite any data.
    """
    users: Dict[str, User]
    media: Dict[str, List[Change]]
    changes: List[Change]

    def _record_media(self, change):
        self.media[change.page].append(change)

    def __init__(self, root: str):
        self.root = root
        self.users = self._find_users()
        self.media = collections.defaultdict(list)

        page_generator = self._find_meta(os.path.join(self.root, 'data', 'meta'), PageChange)
        media_generator = self._find_meta(os.path.join(self.root, 'data', 'media_meta'), MediaChange)

        self.changes = list(page_generator)
        for entry in media_generator:
            self._record_media(entry)
            self.changes.append(entry)

        # The combined history is replayed oldest-first, but the per-file media
        # lists are kept newest-first so MediaChange._attic can treat index 0 as
        # the live copy.
        self.changes.sort(key=attrgetter('timestamp'))
        for v in self.media.values():
            v.sort(key=attrgetter('timestamp'), reverse=True)

    def _find_users(self) -> Dict[str, User]:
        """
        Parses the users.auth.php file to discover all listed users.
        :return: A mapping of login to User objects
        """
        found = {}
        auth_path = os.path.join(self.root, 'conf', 'users.auth.php')
        with open(auth_path, 'r') as auth_file:
            for line in auth_file:
                # Make sure the line actually contains some data
                line = line.rstrip()
                if len(line) == 0:
                    continue
                if line[0] == '#':
                    continue
                login, password, real_name, email, groups, *tail = line.split(':')
                if len(tail) > 0:
                    log.warning("Found extra components when parsing users.auth.php")
                found[login] = User(login=login, real_name=real_name, email=email)

        # Changes without a recorded user (external edits) are attributed to a
        # synthetic "system" user.
        found[None] = User(login="system", real_name="system", email="system@mlug-au.org")
        return found
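    # For reference, a users.auth.php entry is colon-separated (illustrative values):
    #   jdoe:$1$somehash...:Jane Doe:jdoe@example.org:admin,user
    # Only the login, real name, and email are kept; the password hash and
    # groups are ignored.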

    def _find_meta(self, root: str, klass):
        for dirpath, dirnames, filenames in os.walk(root):
            for file in filenames:
                # We don't really care about 'indexed' or 'meta' files.
                base, ext = os.path.splitext(file)
                if ext != '.changes':
                    continue

                # Some paths are autogenerated listings of content that aren't
                # relevant to us (and drastically complicate processing)
                if base in ('_media', '_dokuwiki'):
                    continue

                # Actually read the metadata
                path = os.path.join(dirpath, file)
                for change in self._read_meta(path, klass):
                    yield change

    def _read_meta(self, path: str, klass):
        log.debug("extracting meta from %s", path)
        with open(path, 'r') as data:
            for entry in data:
                timestamp, ip, operation, page, user, description, *tail = entry.split('\t')
                if len(tail) > 1:
                    log.warning("Dropping meta change data")
                else:
                    assert tail[0] == '\n'
                if not user:
                    if description != "external edit":
                        raise RuntimeError("Empty user doesn't correspond to 'external edit'")
                    user = None
                yield klass(
                    timestamp=timestamp,
                    ip=ip,
                    operation=operation,
                    page=page,
                    user=user,
                    description=description
                )
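    # A *.changes line is tab-separated: timestamp, IP, operation, page id, user,
    # summary, plus a trailing (usually empty) extra field, e.g. (made-up values):
    #   1361901072\t127.0.0.1\tE\twiki:syntax\tadmin\tfixed a typo\t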
###############################################################################
if __name__ == "__main__":
    import argparse
    import json

    header = [
        '#!/usr/bin/env bash',
        'die() { echo "$*" 1>&2 ; exit 1; }',
    ]

    def preamble(_src, dst: str):
        cmds = [
            f'git init "{dst}"',
            f'cd "{dst}"',
        ]
        for c in cmds:
            yield c

    def finish(_src, _dst: str):
        arguments = [
            'git commit',
            '--quiet',
            '--allow-empty',
            '--author="doku2git <doku2git@mlug-au.org>"',
            '-m "Converted dokuwiki to git"'
        ]
        yield " ".join(arguments)

    def main():
        parser = argparse.ArgumentParser()
        parser.add_argument("--src", required=True, help="The root directory of the dokuwiki source")
        parser.add_argument("--dst", required=True, help="The git output directory")
        parser.add_argument("--users", type=str, help="A JSON serialised list of supplementary users")
        args = parser.parse_args()

        src = Dokuwiki(root=args.src)

        # Supplementary users from the JSON file are merged over the users
        # parsed from users.auth.php.
        if args.users:
            with open(args.users, 'r') as users:
                for user in json.load(users):
                    src.users[user['login']] = User(**user)

        todo = [
            header,
            preamble(src, args.dst),
            *(change.apply(src, args.dst) for change in src.changes),
            finish(src, args.dst)
        ]
        for cmd in itertools.chain(*todo):
            print(cmd)

    main()
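
# The script prints a bash conversion script to stdout; typical invocation
# (illustrative paths):
#   ./doku2git.py --src /var/lib/dokuwiki --dst /tmp/wiki-repo > convert.sh
#   bash convert.sh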