Last active
August 29, 2015 14:02
-
-
Save hannahwhy/1bdca9cc4235416a3786 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/doc/terse_options.rst b/doc/terse_options.rst | |
index 725ae72..b223979 100644 | |
--- a/doc/terse_options.rst | |
+++ b/doc/terse_options.rst | |
@@ -35,6 +35,7 @@ Brief Option Overview | |
[--random-file FILE] [--edg-file FILE] | |
[--warc-file FILENAME] [--warc-append] | |
[--warc-header STRING] [--warc-max-size NUMBER] | |
+ [--move-warc-to DIRECTORY] | |
[--warc-cdx] [--warc-dedup FILE] [--no-warc-compression] | |
[--no-warc-digests] [--no-warc-keep-log] | |
[--warc-tempdir DIRECTORY] [-r] [-l NUMBER] | |
@@ -196,6 +197,8 @@ Brief Option Overview | |
--warc-header STRING include STRING in WARC file metadata | |
--warc-max-size NUMBER | |
write sequential WARC files sized about NUMBER bytes | |
+ --move-warc-to DIRECTORY once a sequential WARC file has reached its max size, | |
+ move it to DIRECTORY | |
--warc-cdx write CDX file along with the WARC file | |
--warc-dedup FILE write revisit records using digests in FILE | |
--no-warc-compression | |
diff --git a/wpull/options.py b/wpull/options.py | |
index 57af72b..f75483d 100644 | |
--- a/wpull/options.py | |
+++ b/wpull/options.py | |
@@ -891,6 +891,12 @@ class AppArgumentParser(argparse.ArgumentParser): | |
help=_('write sequential WARC files sized about NUMBER bytes') | |
) | |
group.add_argument( | |
+ '--move-warc-to', | |
+ metavar='DIRECTORY', | |
+ default=None, | |
+ help=_('once a sequential WARC file has reached its max size, move it to DIRECTORY') | |
+ ) | |
+ group.add_argument( | |
'--warc-cdx', | |
action='store_true', | |
help=_('write CDX file along with the WARC file') | |
diff --git a/wpull/recorder.py b/wpull/recorder.py | |
index 88a9991..c8bcdf5 100644 | |
--- a/wpull/recorder.py | |
+++ b/wpull/recorder.py | |
@@ -11,6 +11,7 @@ import logging | |
import os.path | |
import re | |
import sys | |
+import shutil | |
from tempfile import NamedTemporaryFile | |
import tempfile | |
import time | |
@@ -140,6 +141,7 @@ WARCRecorderParams = namedlist.namedtuple( | |
('digests', True), | |
('cdx', None), | |
('max_size', None), | |
+ ('move_to', None), | |
('url_table', None), | |
('software_string', None) | |
] | |
@@ -157,6 +159,8 @@ Args: | |
cdx (bool): If True, a CDX file will be written. | |
max_size (int): If provided, output files are named like | |
``name-00000.ext`` and the log file will be in ``name-meta.ext``. | |
+ move_to (str): If provided, completed sequential WARCs will be moved | |
+ to the given directory. | |
url_table (:class:`.database.URLTable`): If given, then ``revist`` | |
records will be written. | |
software_string (str): The value for the ``software`` field in the | |
@@ -287,6 +291,13 @@ class WARCRecorder(BaseRecorder): | |
_logger.debug('Starting new warc file due to max size.') | |
self._start_new_warc_file() | |
+ if self._params.move_to is not None: | |
+ if os.path.isdir(self._params.move_to): | |
+ shutil.move(self._warc_filename, self.params.move_to) | |
+ else: | |
+ _logger.error('%s is not a directory; not moving %s.' % ( | |
+ self._params.move_to, self._warc_filename)) | |
+ | |
def set_length_and_maybe_checksums(self, record, payload_offset=None): | |
'''Set the content length and possibly the checksums.''' | |
if self._params.digests: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment