cat abuse with split

# cat gets used all the time just to pipe the contents of a
# file to stdout.
# But it is actually for concatenating files.
# It has a partner in crime called split.
# Working together they are very powerful for parallel processing.
# split - do work in parallel - cat

# split then cat will produce an output file which is identical to the input:

# Make a 1 GiB (1024 * 1024 * 1024 bytes) file of random bytes on my external SSD.
# mktemp creates the uniquely named target; quoting its output keeps the
# redirection target a single word whatever the resulting path contains.
time head -c $(( 1024 * 1024 * 1024 )) /dev/urandom > "$(mktemp '/DataSwap/big.XXXXXXXX')"

real    0m5.738s
user    0m0.057s
sys    0m5.680s


# Yey - that was fast!
# Cut the big file into pieces of 1024 * 1024 bytes each, named
# /DataSwap/parts<suffix>, then list the pieces.
[aturner@Alexanders-MBP ~]$ split -b $((1024 * 1024)) /DataSwap/big.* /DataSwap/parts
[aturner@Alexanders-MBP ~]$ ls /DataSwap/parts*
/DataSwap/partsaa  /DataSwap/partsgp  /DataSwap/partsne  /DataSwap/partstt    /DataSwap/partszabi  /DataSwap/partszahx
/DataSwap/partsab  /DataSwap/partsgq  /DataSwap/partsnf  /DataSwap/partstu    /DataSwap/partszabj  /DataSwap/partszahy
/DataSwap/partsac  /DataSwap/partsgr  /DataSwap/partsng  /DataSwap/partstv    /DataSwap/partszabk  /DataSwap/partszahz

# ... stuff not shown as there are 1024 files!

# cat them back together - note how split creates file names
# such that the shell's sorted glob expansion already yields the correct
# ordering, so there is no need to run ls and word-split its output.
time cat /DataSwap/parts* > /DataSwap/Rebuild

real    0m1.949s
user    0m0.017s
sys    0m0.802s
 

# Again - time is not too shabby (over 500 megabytes per second)!
# cmp exits 0 only when the two files are byte-identical, so chain on its
# status directly instead of inspecting $? afterwards. echo is used because
# print is not a builtin in bash or POSIX sh.
cmp /DataSwap/big.oTFSbZnD /DataSwap/Rebuild && echo Yey
Yey

 

Comments

Popular posts from this blog

Bithon: Run Python Interactively Inside Bash

oche, like echo but a bit easier to use.

Parsing Columns From Files WITHOUT awk