| 1 | #!/usr/bin/env bash
|
| 2 | #
|
| 3 | # Test the size of file, encoding, and decoding speed.
|
| 4 | #
|
| 5 | # Usage:
|
| 6 | # ./oheap.sh <function name>
|
| 7 |
|
# Strict mode: abort on use of an unset variable, on a failure in any stage of
# a pipeline, and on any unhandled non-zero exit status.
set -o nounset -o pipefail -o errexit

# Shared helpers.  NOTE(review): `log`, `csv2html` and $OSH_OVM are used below
# but are not defined in this file; presumably they come from these two
# files -- confirm.
source test/common.sh
source benchmarks/common.sh

# Directory that receives every artifact generated by this script.
readonly BASE_DIR=_tmp/oheap
|
| 16 |
|
# Parse one shell script with OSH and save its AST in oheap format.
#
# Globals:
#   OSH_OVM (read) - command used to invoke OSH.  Not defined in this file;
#                    presumably provided by one of the sourced common.sh files.
# Arguments:
#   $1 - path of the shell script to parse
#   $2 - path of the oheap file to write (created or truncated)
encode-one() {
  local script=$1
  local oheap_out=$2

  # $OSH_OVM stays unquoted, as before, in case it holds a command plus
  # arguments.  The redirect target is quoted so that an output path containing
  # whitespace is not rejected by bash as an "ambiguous redirect".
  $OSH_OVM -n --ast-format oheap "$script" > "$oheap_out"
}
|
| 22 |
|
# Print one "<script path> <oheap output path>" pair per line, one pair for
# each script listed in benchmarks/osh-parser-files.txt.  encode-all feeds
# these pairs to xargs, two words at a time.
#
# Globals:
#   BASE_DIR (read) - directory under which the oheap output paths are placed
# Outputs:
#   Lines of the form "<path> $BASE_DIR/<basename of path>__oheap" on stdout.
task-spec() {
  local path

  # -r keeps backslashes literal.  The `|| [[ -n ... ]]` clause makes sure a
  # final line without a trailing newline is still processed.
  while read -r path || [[ -n "$path" ]]; do
    # A blank line would yield a pair with an empty script path and throw off
    # the two-at-a-time grouping downstream, so skip it.
    [[ -n "$path" ]] || continue

    echo "$path $BASE_DIR/$(basename "$path")__oheap"
  done < benchmarks/osh-parser-files.txt
}
|
| 28 |
|
# Encode every script listed by task-spec, running each encode through
# benchmarks/time.py.
#
# Globals:
#   BASE_DIR (read) - output directory; created if missing
# Outputs:
#   $BASE_DIR/times.csv, (re)created with the header 'status,elapsed_secs'.
#   NOTE(review): time.py presumably appends one row per run to the file given
#   by --output -- confirm against benchmarks/time.py.
encode-all() {
  mkdir -p "$BASE_DIR"

  local times_csv=$BASE_DIR/times.csv
  echo 'status,elapsed_secs' > "$times_csv"

  # xargs appends each "<script> <oheap path>" pair from task-spec, so the
  # command handed to time.py is: $0 encode-one <script> <oheap path>.
  # "$0" is quoted so that a script path containing whitespace stays one word.
  task-spec | xargs -n 2 --verbose -- \
    benchmarks/time.py --output "$times_csv" -- \
    "$0" encode-one
}
|
| 39 |
|
# Out of curiosity, compress oheap and originals.

# Compress every encoded oheap file with both gzip and xz.
#
# Globals:
#   BASE_DIR (read) - directory holding the *__oheap files
# Outputs:
#   $BASE_DIR/oheap-compressed/<name>.gz and <name>.xz for each
#   $BASE_DIR/<name> matching *__oheap.  Each file name is reported via `log`
#   (not defined in this file).
compress-oheap() {
  local c_dir=$BASE_DIR/oheap-compressed
  mkdir -p "$c_dir"

  local bin name
  for bin in "$BASE_DIR"/*__oheap; do
    # When nothing matches, the glob is left as the literal pattern.  Skip it
    # instead of creating a stray '*__oheap.gz' and then failing in gzip.
    [[ -e "$bin" ]] || continue

    # Assigned separately from `local` so a basename failure is not masked.
    name=$(basename "$bin")
    log "Compressing $name"
    gzip --stdout "$bin" > "$c_dir/$name.gz"
    xz --stdout "$bin" > "$c_dir/$name.xz"
  done
}
|
| 52 |
|
# Compress the original source text of every benchmark script with both gzip
# and xz, for comparison against the compressed oheap files.
#
# Globals:
#   BASE_DIR (read) - parent of the output directory
# Inputs:
#   benchmarks/osh-parser-files.txt - one script path per line
# Outputs:
#   $BASE_DIR/src-compressed/<name>__text.gz and <name>__text.xz per script.
#   Each file name is reported via `log` (not defined in this file).
compress-text() {
  local c_dir=$BASE_DIR/src-compressed
  mkdir -p "$c_dir"

  local src name
  # -r keeps backslashes literal.  The `|| [[ -n ... ]]` clause makes sure a
  # final line without a trailing newline is still processed.
  while read -r src || [[ -n "$src" ]]; do
    [[ -n "$src" ]] || continue  # ignore blank lines

    # Assigned separately from `local` so a basename failure is not masked.
    name=$(basename "$src")
    log "Compressing $name"
    gzip --stdout "$src" > "$c_dir/${name}__text.gz"
    xz --stdout "$src" > "$c_dir/${name}__text.xz"
  done < benchmarks/osh-parser-files.txt
}
|
| 64 |
|
# Emit one CSV row, "<size in bytes>,<format>,<compression>,<path>", for each
# path given.
#
# Arguments:
#   $1   - value for the 'format' column
#   $2   - value for the 'compression' column
#   $3.. - paths to measure
print-size() {
  local format_label=$1 compression_label=$2
  shift 2

  # In find's -printf, %s is the size in bytes and %p is the path as given.
  local row_format="%s,${format_label},${compression_label},%p\n"

  # -maxdepth 0 limits find to the named paths themselves (no descent).
  find "$@" -maxdepth 0 -printf "$row_format"
}
|
| 73 |
|
# Print the sizes CSV: a header row, then one row per file for the source text
# and for the oheap encoding, each uncompressed, gzipped and xz-compressed.
#
# Globals:
#   BASE_DIR (read) - directory holding the oheap and compressed files
print-csv() {
  echo 'num_bytes,format,compression,path'
  # TODO
  local ext

  # Original source text: raw, then each compressed variant.
  print-size text none benchmarks/testdata/*
  for ext in gz xz; do
    print-size text "$ext" "$BASE_DIR"/src-compressed/*."$ext"
  done

  # oheap encoding: raw, then each compressed variant.
  print-size oheap none "$BASE_DIR"/*__oheap
  for ext in gz xz; do
    print-size oheap "$ext" "$BASE_DIR"/oheap-compressed/*."$ext"
  done
}
|
| 85 |
|
# This can be done on any host.
#
# Run the three measurement steps in order: encode all scripts, then compress
# the oheap files, then compress the original sources.
measure() {
  local step
  for step in encode-all compress-oheap compress-text; do
    "$step"
  done
}
|
| 92 |
|
# Write the output of print-csv to $BASE_DIR/stage1/sizes.csv, creating the
# stage1 directory if needed.
#
# Globals:
#   BASE_DIR (read)
stage1() {
  local sizes_csv=$BASE_DIR/stage1/sizes.csv

  # Strip the file name to get the directory that has to exist.
  mkdir -p "${sizes_csv%/*}"
  print-csv > "$sizes_csv"
}
|
| 98 |
|
# Print the "OHeap Encoding" HTML report on stdout.
#
# Arguments:
#   $1 - directory containing encoding_size.csv and encoding_ratios.csv
# Outputs:
#   A complete HTML page.  Each CSV is passed to `csv2html` (not defined in
#   this file) and whatever it prints is placed under the matching heading.
print-report() {
  local in_dir=$1
  local base_url='../../web'

  cat <<EOF
<!DOCTYPE html>
<html>
<head>
<title>OHeap Encoding</title>
<script type="text/javascript" src="$base_url/table/table-sort.js"></script>
<link rel="stylesheet" type="text/css" href="$base_url/table/table-sort.css" />
<link rel="stylesheet" type="text/css" href="$base_url/benchmarks.css" />

</head>
<body>
<p id="home-link">
<a href="/">oilshell.org</a>
</p>
<h2>OHeap Encoding</h2>

<h3>Encoding Size (KB)</h3>

<p>Sizes are in KB (powers of 10), not KiB (powers of 2).</p>
EOF
  # Quoted so an input directory containing whitespace stays a single argument.
  csv2html "$in_dir/encoding_size.csv"

  cat <<EOF
<h3>Encoding Ratios</h3>
EOF
  csv2html "$in_dir/encoding_ratios.csv"

  cat <<EOF
</body>
</html>
EOF
}
|
| 135 |
|
| 136 |
|
| 137 | # TODO: instead of running osh_demo, we should generate a C++ program that
|
| 138 | # visits every node and counts it. The output might look like:
|
| 139 | #
|
| 140 | # - It can also print out the depth of the tree.
|
| 141 | # - Summary: number of different types used
|
| 142 | # - another option: decode/validate utf-8. See Visitor Use Cases.
|
| 143 | #
|
| 144 | # # 500 instances
|
| 145 | # line_span = (...)
|
| 146 | # # 455 instances
|
| 147 | # token = (
|
| 148 | # id id,
|
| 149 | # string val, # lengths: min 0, max 20, avg 30
|
| 150 | # int? span_id,
|
| 151 | # )
|
| 152 | #
|
| 153 | # command =
|
| 154 | # # 20 instances
|
| 155 | # NoOp
|
| 156 | # -- TODO: respect order
|
| 157 | # # 20 instances
|
| 158 | # | SimpleCommand(
|
| 159 | # word* words, # min length: 0, max: 10, mean: 3.3 ?
|
| 160 | # redir* redirects, # min length 0, max: 2, mean: 4.4
|
| 161 | # env_pair* more_env)
|
| 162 | # | Sentence(command child, token terminator)
|
| 163 | #
|
| 164 | # This might help with encoding things inline?
|
| 165 | # You will definitely need to append to ASDL arrays. I don't think you'll need
|
| 166 | # to append to strings. But you might want to store strings inline with
|
| 167 | # structs.
|
# I guess it wouldn't hurt to print out a table of EVERY node and array, along
# with the type.
|
| 170 | # parent_type,field_name,type,subtype,length
|
| 171 | # token,val,Str,-,5
|
| 172 | # SimpleCommand,redirects,Array,redirect,10
|
| 173 | #
|
| 174 | # This lets you figure out what the common types are, as well as the common
|
| 175 | # lengths.
|
| 176 |
|
# Run _tmp/osh_demo on every encoded oheap file.  For each file, print its
# path, then the number of lines osh_demo wrote; `time` reports how long that
# pipeline took.
#
# Globals:
#   BASE_DIR (read) - directory holding the *__oheap files
decode-all() {
  local bin
  for bin in "$BASE_DIR"/*__oheap; do
    # When nothing matches, the glob is left as the literal pattern.  Skip it
    # rather than handing a non-existent path to osh_demo.
    [[ -e "$bin" ]] || continue

    echo "$bin"
    time _tmp/osh_demo "$bin" | wc -l
  done
}
|
| 183 |
|
# Dispatch: run the function named by the first command-line argument, passing
# the remaining arguments to it (see "Usage" at the top of this file).
"$@"
|