aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrian Harring <ferringb@google.com>2012-10-16 05:52:33 -0700
committerBrian Harring <ferringb@google.com>2012-10-16 13:28:49 -0700
commit7b996546be6ca923c978f6292187827340355fea (patch)
tree5d5269c4908da32febd6946b132c9b2731710235
parentadd --full mode; via this, the two phases can work together. faster namely. (diff)
downloadgit-conversion-tools-7b996546be6ca923c978f6292187827340355fea.tar.gz
git-conversion-tools-7b996546be6ca923c978f6292187827340355fea.tar.bz2
git-conversion-tools-7b996546be6ca923c978f6292187827340355fea.zip
abuse mmap to save on memory churn
-rwxr-xr-xrewrite-commit-dump.py51
1 files changed, 37 insertions, 14 deletions
diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py
index 11264f4..53a1bae 100755
--- a/rewrite-commit-dump.py
+++ b/rewrite-commit-dump.py
@@ -1,6 +1,8 @@
#!/usr/bin/python
+import contextlib
import collections
import functools
+import mmap
import itertools
import operator
import os
@@ -8,6 +10,27 @@ import re
import sys
from collections import namedtuple
@contextlib.contextmanager
def mmap_open(path):
    """Context manager yielding a read-only, shared memory map of a file.

    The whole file at ``path`` is mapped (its length is taken from
    ``os.fstat``).  The file descriptor is only needed while the mapping is
    being created, so it is closed immediately afterwards; the mapping
    itself is closed when the ``with`` block exits, whether normally or
    through an exception.

    :param path: filesystem path of the file to map.
    :return: yields the ``mmap.mmap`` object covering the file.
    :raises OSError: if the file cannot be opened or stat'ed.

    Errors raised by ``mmap.mmap`` itself (for example when asked to map an
    empty file) are propagated unchanged.
    """
    fd = os.open(path, os.O_RDONLY)
    try:
        handle = mmap.mmap(fd, os.fstat(fd).st_size, mmap.MAP_SHARED, mmap.PROT_READ)
    finally:
        # Always release the descriptor, including descriptor 0, which a
        # plain truthiness test would have skipped.
        os.close(fd)
    try:
        yield handle
    finally:
        handle.close()
+
def readline_iterate(handle):
    """Yield successive lines from ``handle`` until ``readline()`` is empty.

    Only ``handle.readline()`` is used, so any object providing that method
    works.  Each line is yielded exactly as returned, including its trailing
    newline; iteration stops at the first falsy result.
    """
    while True:
        current = handle.readline()
        if not current:
            return
        yield current
+
mangler = []
mangler.append(functools.partial(
re.compile(r"^\(paludis (0.1.*)\)$", re.M|re.I).sub,
@@ -198,9 +221,10 @@ def serialize_records(records, handle, target='refs/heads/master', progress=100)
raise AssertionError("serialize is out of sync; don't know field %s" % name)
write("\n")
-def deserialize_blob_map(source):
- source = (x.strip().split() for x in source)
- return dict((int(x[0].lstrip(':')), x[1]) for x in source)
def deserialize_blob_map(path):
    """Load the mark-to-value mapping stored in the file at ``path``.

    Every non-blank line is split on whitespace.  The first field is the
    mark, with any leading ':' removed and converted to ``int``; the second
    field is the value stored for it (presumably a git blob id -- confirm
    against whatever writes the index).  Further fields are ignored and
    blank lines are skipped.

    :param path: filesystem path of the index file; it is memory mapped
        rather than read through a regular file object.
    :return: dict mapping each integer mark to the second field of its line.
    :raises IndexError: if a non-blank line has fewer than two fields.
    :raises ValueError: if a mark is not a valid integer.
    """
    with mmap_open(path) as handle:
        # dict() consumes the generator eagerly, so the mapping is fully
        # built before the memory map is closed on leaving the block.
        rows = (line.split() for line in readline_iterate(handle))
        return dict((int(row[0].lstrip(':')), row[1]) for row in rows if row)
def simple_dedup(records):
# dedup via timestamp/author/msg
@@ -272,12 +296,7 @@ def main(argv):
if not argv:
# See python manpage for details; stdin buffers if you iterate over it;
# we want each line as they're available, thus use this form.
- def source():
- line = sys.stdin.readline()
- while line:
- yield line
- line = sys.stdin.readline()
- source = source()
+ source = readline_iterate(sys.stdin)
for directory in source:
directory = directory.strip()
tmp = os.path.join(directory, 'cvs2svn-tmp')
@@ -286,12 +305,16 @@ def main(argv):
sys.stderr.write("skipping %s; no commit data\n" % directory)
sys.stderr.flush()
continue
- records.extend(manifest_dedup(
- deserialize_records(
- open(commits, 'r'),
- deserialize_blob_map(open(os.path.join(tmp, 'git-blob.idx'))))
+ with mmap_open(commits) as data:
+ records.extend(
+ manifest_dedup(
+ deserialize_records(data,
+ deserialize_blob_map(
+ os.path.join(tmp, 'git-blob.idx')
+ )
+ )
+ )
)
- )
sorter = operator.attrgetter('timestamp')
# Get them into timestamp ordering first; this is abusing python stable
# sort pretty much since any commits to the same repo w/ the same timestamp