Created
December 20, 2017 15:55
-
-
Save dutchiechris/1da2f8786107a635e7e6f5a8c50e84dc to your computer and use it in GitHub Desktop.
Google Cloud Platform Big Query script to copy data from one region to another
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
# | |
# Google Cloud Platform Big Query script to copy data from one region to another | |
# | |
# Disclaimer: This is not an officially supported Google product. | |
# Written code can be used as a baseline but is not meant for production usage. | |
# Project that owns source and destination datasets | |
PROJECT=my-project | |
# Source bucket and dataset must be in the same region | |
SOURCE_DATASET=source_data | |
SOURCE_BUCKET=gs://source-bucket/bqexport | |
# Destination bucket and dataset must be in the same region | |
DEST_DATASET=dest_data | |
DEST_BUCKET=gs://dest-bucket/bqexport | |
### | |
### Main script: | |
### | |
TMPDIR=$(mktemp -d) | |
## Extract | |
for f in `bq --project_id=$PROJECT ls --max_results=100000 $SOURCE_DATASET | grep TABLE | awk '{print $1}'`; do | |
echo "**** Saving Schema from $f to $TMPDIR/schema-$f.json" | |
bq show --format=prettyjson "$PROJECT_ID:$SOURCE_DATASET.$f" | jq '.schema.fields' > $TMPDIR/schema-$f.json | |
echo "**** Extracting $f to $SOURCE_BUCKET/$f.json" | |
bq extract --destination_format=NEWLINE_DELIMITED_JSON "$PROJECT_ID:$SOURCE_DATASET.$f" $SOURCE_BUCKET/$f.json | |
done | |
## Copy | |
echo "**** Copying data from $SOURCE_BUCKET/*.json to $DEST_BUCKET/" | |
gsutil -m cp $SOURCE_BUCKET/*.json $DEST_BUCKET/ | |
## Load | |
for f in `bq --project_id=$PROJECT ls --max_results=100000 $SOURCE_DATASET | grep TABLE | awk '{print $1}'`; do | |
echo "**** Loading $f to $DEST_BUCKET/$f.json with schema $TMPDIR/schema-$f.json" | |
bq --nosync load --replace --source_format=NEWLINE_DELIMITED_JSON "$PROJECT_ID:$DEST_DATASET.$f" $DEST_BUCKET/$f.json $TMPDIR/schema-$f.json | |
done | |
echo | |
echo "**** Check the load job status for each table above and remove $SOURCE_BUCKET and $DEST_BUCKET when the background loads are completed" | |
rm -rf $TMPDIR |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment