#!/usr/bin/env python3

###############################################################################
from typing import List, Dict, Iterable, Callable, Generator
from operator import attrgetter

import os
import collections
import itertools

###############################################################################
import logging

logging.basicConfig(level=logging.WARNING)
log = logging.getLogger()

###############################################################################
def find_index(haystack: Iterable, needle, comparator: Callable) -> int:
    """
    Find the index of the first value within 'haystack' that compares as equal
    to 'needle' using the supplied 'comparator'.
    """
    for index, data in enumerate(haystack):
        if comparator(data, needle):
            return index

    raise ValueError("Needle not found")
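
# An illustrative use of find_index (the values here are made up):
#
#   find_index(["C", "E", "D"], "E", lambda a, b: a == b)  # -> 1
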
###############################################################################
class User(object):
    """
    Represents a single user within the Dokuwiki namespace.

    We deliberately do not store the password hash.
    """

    login: str
    real_name: str
    email: str

    def __init__(self, login, real_name, email):
        self.login = login
        self.real_name = real_name
        self.email = email

    def author(self) -> str:
        """
        Return authorship information formatted for `git commit`.
        eg. 'A U Thor <author@example.com>'
        """
        return f"{self.real_name} <{self.email}>"

# -----------------------------------------------------------------------------
class Change(object):
    """
    The base class for all change records in the Dokuwiki instance.

    This class stores all the required data fields, but subclasses must
    provide some required functionality; eg, path lookups and history
    storage are different for Pages and Media.
    """

    timestamp: str
    ip: str
    operation: str
    page: str
    user: str
    description: str
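
    # The single-letter operation codes used in Dokuwiki changelogs:
    # C = create, E = edit, e = minor edit, D = delete.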
    KNOWN_OPERATIONS = "CEeD"

    def __init__(self, timestamp, ip, operation, page, user, description):
        assert operation in Change.KNOWN_OPERATIONS, f"Operation '{operation}' is not handled"

        self.timestamp = timestamp
        self.ip = ip
        self.operation = operation
        self.page = page
        self.user = user
        self.description = description

    def __str__(self):
        return f'{self.user}@{self.timestamp}:{self.page} "{self.description}"'

    def _delete(self, _src, _dst: str) -> Generator[str, None, None]:
        """
        Generates the required commands to delete a given page.

        :param _src: The Dokuwiki instance the change belongs to
        :param _dst: The path to the root of the git repository.
        """
        localpath = self.page.replace(':', os.path.sep)
        yield f'git rm --quiet "{localpath}.txt"'

    def apply(self, src, dst: str) -> Generator[str, None, None]:
        """
        Generate a list of commands to enact this changeset from the Dokuwiki
        instance; eg, create, edit, or delete a page.

        The actual operation is not performed within this function. Instead,
        it's dispatched to (potentially overridden) member functions.

        Some basic configure-time checks are performed; primarily for the
        existence of the required source files.

        :param src: The Dokuwiki instance this change comes from
        :param dst: The path to the root of the git repository.
        """
        log.info(f"Applying {self.operation}: '{self.description}'")

        # If we're not trying to delete a page then ensure we can see the
        # source material in the Dokuwiki attic
        if self.operation != 'D':
            attic = os.path.join(src.root, self._attic(src))
            if not os.path.isfile(attic):
                log.error(f'Source file {attic} not present for {self}; ignoring')
                return

        # Find the function that will apply this update. Deliberately
        # construct this to throw if the operation isn't handled.
        func = {
            'C': self._update,
            'E': self._update,
            'e': self._update,
            'D': self._delete
        }[self.operation]

        # Generate the subcommands using the update function, but protect
        # against failure.
        for cmd in func(src, dst):
            yield f'{cmd} || die "{self.timestamp}"'

        # Finalise the update with a commit
        yield " ".join([
            'git commit',
            '--quiet',
            '--allow-empty',
            '--allow-empty-message',
            f'--author="{src.users[self.user].author()}"',
            f'--date="{self.timestamp} +0000"',
            f'-m "{self.description}"',
            f'|| die "{self.timestamp}"'
        ])

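# For illustration only, one page edit typically expands to commands of this
# shape (the paths and timestamps here are made up):
#
#   gunzip -c ".../data/attic/wiki/start.1562480222.txt.gz" > ".../wiki/start.txt" || die "1562480222"
#   git add "wiki/start.txt" || die "1562480222"
#   git commit --quiet ... --date="1562480222 +0000" -m "summary" || die "1562480222"
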
# -----------------------------------------------------------------------------
class PageChange(Change):
    """
    Represents one change to one page in the Dokuwiki instance.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _attic(self, _src) -> str:
        """
        Find the path to the history data within the Dokuwiki directories.

        :param _src: The Dokuwiki instance this change belongs to.
        :return: The relative path from the root of Dokuwiki to the compressed
                 data. This is always gzip compressed.
        """
        local = self.page.replace(':', os.path.sep)

        return os.path.join(
            'data',
            'attic',
            f'{local}.{self.timestamp}.txt.gz'
        )

    def _update(self, src, dst: str) -> Generator[str, None, None]:
        """
        Yield the commands required to unpack this change.

        It is assumed we don't need to remove old data and can just uncompress
        over the top of the old data.

        :param src: The Dokuwiki instance this change belongs to.
        :param dst: The absolute path to the root of the git repository
        :return:
        """
        localpath = self.page.replace(':', os.path.sep)

        dstpath = os.path.join(dst, localpath)
        dstdir = os.path.dirname(dstpath)

        attic = os.path.join(src.root, self._attic(src))

        cmds = [
            f'mkdir -p "{dstdir}"',
            f'gunzip -c "{attic}" > "{dstpath}.txt"',
            f'git add "{localpath}.txt"',
        ]

        yield from cmds

# -----------------------------------------------------------------------------
class MediaChange(Change):
    """
    Represents changes to one media file in a Dokuwiki instance.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _attic(self, src) -> str:
        """
        Calculates the path to the source data required for this change.

        :param src: The Dokuwiki instance this change belongs to
        :return: The absolute path to the data
        """
        local = self.page.replace(':', os.path.sep)

        # If we're the most recent iteration of the file then we return a path
        # directly to the live media directory.
        #
        # Keep in mind that we search from newest to oldest (due to the
        # reversed sorting in media list records).
        #
        # There are some situations where we get two edits with the same
        # timestamp; but we can't have additional entries in `media_attic` for
        # the same timestamp (and there doesn't appear to be an alternative
        # method to store these changes).
        #
        # We treat this as if they can be collapsed into one change for the
        # purposes of path lookups.
        changes = src.media[self.page]
        index = find_index(changes, self, lambda a, b: a.timestamp == b.timestamp)
        if index == 0:
            return os.path.join(src.root, 'data', 'media', local)

        # We're a historical entry, so we need to find the backup within
        # media_attic
        basename, extension = os.path.splitext(local)
        return os.path.join(
            src.root,
            'data',
            'media_attic',
            f'{basename}.{self.timestamp}{extension}'
        )

    def _update(self, src, dst: str) -> Generator[str, None, None]:
        """
        Yields the commands required to unpack this one change.

        :param src: The Dokuwiki instance this change belongs to.
        :param dst: The absolute path to the root of the git repository
        :return:
        """
        localpath = self.page.replace(':', os.path.sep)
        localdir = os.path.dirname(localpath)

        dstpath = os.path.join(dst, localdir)

        cmds = [
            # The destination namespace directory may not exist yet; create
            # it first (as PageChange._update does for pages).
            f'mkdir -p "{dstpath}"',
            f'cp "{self._attic(src)}" "{dstpath}"',
            f'git add "{dstpath}"'
        ]

        yield from cmds

###############################################################################
class Dokuwiki:
    """
    Represents data for a single Dokuwiki instance.

    It is not safe to use this on an installation that may be receiving
    updates; but it should be safe to use on a live site without active
    editors. Either way it will never rewrite any data.
    """

    users: Dict[str, User]
    media: Dict[str, List[Change]]
    changes: List[Change]

    def _record_media(self, change):
        self.media[change.page].append(change)

    def __init__(self, root: str):
        self.root = root
        self.users = self._find_users()
        self.media = collections.defaultdict(list)

        page_generator = self._find_meta(os.path.join(self.root, 'data', 'meta'), PageChange)
        media_generator = self._find_meta(os.path.join(self.root, 'data', 'media_meta'), MediaChange)

        self.changes = list(page_generator)
        for entry in media_generator:
            self._record_media(entry)
            self.changes.append(entry)

        self.changes.sort(key=attrgetter('timestamp'))
        for changes in self.media.values():
            changes.sort(key=attrgetter('timestamp'), reverse=True)

    def _find_users(self) -> Dict[str, User]:
        """
        Parses the users.auth.php file to discover all listed users.

        :return: A mapping of login to User objects
        """
        found = {}

        auth_path = os.path.join(self.root, 'conf', 'users.auth.php')
        with open(auth_path, 'r') as auth_file:
            for line in auth_file:
                # Make sure the line actually contains some data
                line = line.rstrip()
                if len(line) == 0:
                    continue
                if line[0] == '#':
                    continue
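
                # Each entry is colon separated, eg. (illustrative values):
                #   alice:<password hash>:Alice Example:alice@example.com:admin,user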

                login, _password, real_name, email, _groups, *tail = line.split(':')
                if len(tail) > 0:
                    log.warning("Found extra components when parsing users.auth.php")

                found[login] = User(login=login, real_name=real_name, email=email)

        # Changes with no recorded user ("external edit") are attributed to a
        # synthetic system account.
        found[None] = User(login="system", real_name="system", email="system@mlug-au.org")
        return found

    def _find_meta(self, root: str, change_type):
        for dirpath, _dirnames, filenames in os.walk(root):
            for file in filenames:
                # We don't really care about 'indexed' or 'meta' files.
                base, ext = os.path.splitext(file)
                if ext != '.changes':
                    continue

                # Some paths are autogenerated listings of content that aren't
                # relevant to us (and drastically complicate processing)
                if base in ('_media', '_dokuwiki'):
                    continue

                # Actually read the metadata
                path = os.path.join(dirpath, file)
                yield from self._read_meta(path, change_type)

    def _read_meta(self, path: str, change_type):
        log.debug("extracting meta from %s", path)
        with open(path, 'r') as data:
            for entry in data:
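                # Each .changes line is tab separated, eg. (illustrative values):
                #   1562480222 <TAB> 203.0.113.7 <TAB> E <TAB> wiki:start <TAB> alice <TAB> fix typo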
                timestamp, ip, operation, page, user, description, *tail = entry.rstrip('\n').split('\t')

                # Dokuwiki appends a trailing (normally empty) extra field;
                # warn if it actually carries data.
                if any(tail):
                    log.warning("Dropping extra meta change data")

                if not user:
                    if description != "external edit":
                        raise RuntimeError("Empty user doesn't correspond to 'external edit'")
                    user = None

                yield change_type(
                    timestamp=timestamp,
                    ip=ip,
                    operation=operation,
                    page=page,
                    user=user,
                    description=description
                )

###############################################################################
if __name__ == "__main__":
    import argparse
    import json

    # The emitted script is bash; die() aborts on the first failed command,
    # reporting the timestamp of the offending change.
    header = [
        '#!/usr/bin/env bash',
        'die() { echo "$*" 1>&2 ; exit 1; }',
    ]

    def preamble(_src, dst: str):
        cmds = [
            f'git init "{dst}"',
            f'cd "{dst}"',
        ]

        yield from cmds

    def finish(_src, _dst: str):
        arguments = [
            'git commit',
            '--quiet',
            '--allow-empty',
            # git expects 'Name <email>' ordering here
            '--author="doku2git <doku2git@mlug-au.org>"',
            '-m "Converted dokuwiki to git"'
        ]
        yield " ".join(arguments)

    def main():
        parser = argparse.ArgumentParser()
        parser.add_argument("--src", required=True, help="The root directory of the dokuwiki source")
        parser.add_argument("--dst", required=True, help="The git output directory")
        parser.add_argument("--users", type=str, help="A JSON serialised list of supplementary users")

        args = parser.parse_args()
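
        # Typical invocation (illustrative; the script name is an assumption):
        #   ./doku2git.py --src /var/www/dokuwiki --dst /tmp/wiki-git > convert.sh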

        src = Dokuwiki(root=args.src)

        if args.users:
            with open(args.users, 'r') as users:
                for user in json.load(users):
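                    # Each entry must provide the User fields, eg. (illustrative):
                    #   {"login": "bob", "real_name": "Bob Builder", "email": "bob@example.com"}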
                    src.users[user['login']] = User(**user)

        todo = [
            header,
            preamble(src, args.dst),
            *(change.apply(src, args.dst) for change in src.changes),
            finish(src, args.dst)
        ]

        for cmd in itertools.chain(*todo):
            print(cmd)

    main()