1 | #!/usr/bin/env Rscript
|
2 | #
|
3 | # benchmarks/report.R -- Analyze data collected by shell scripts.
|
4 | #
|
5 | # Usage:
|
6 | # benchmarks/report.R OUT_DIR [TIMES_CSV...]
|
7 |
|
8 | # Suppress warnings about functions masked from 'package:stats' and 'package:base'
|
9 | # filter, lag
|
10 | # intersect, setdiff, setequal, union
|
11 | library(dplyr, warn.conflicts = FALSE)
|
12 | library(tidyr) # spread()
|
13 | library(stringr)
|
14 |
|
15 | source('benchmarks/common.R')
|
16 |
|
# Keep strings as character vectors when building/reading data frames.
# Spelled FALSE rather than F: F is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)
|
18 |
|
# Pretty printing: format numbers with ',' as the thousands separator.
commas <- function(x) {
  format(x, big.mark = ',')
}
|
23 |
|
# GitHub URL (master branch of oilshell/oil) for a repo-relative path.
sourceUrl <- function(path) {
  url_format <- 'https://github.com/oilshell/oil/blob/master/%s'
  sprintf(url_format, path)
}
|
27 |
|
# GitHub URL for a file under benchmarks/testdata/.
# Takes a filename, not a path.
sourceUrl2 <- function(filename) {
  url_format <- 'https://github.com/oilshell/oil/blob/master/benchmarks/testdata/%s'
  sprintf(url_format, filename)
}
|
34 |
|
# GitHub URL for a mycpp example; '.py' is appended to the given name.
mycppUrl <- function(path) {
  url_format <- 'https://github.com/oilshell/oil/blob/master/mycpp/examples/%s.py'
  sprintf(url_format, path)
}
|
38 |
|
39 |
|
# TODO: Set up cgit because Github links are slow.
#
# GitHub URL into the oilshell/benchmark-data repo: <subdir>/<name><suffix>.
benchmarkDataLink <- function(subdir, name, suffix) {
  #sprintf('../../../../benchmark-data/shell-id/%s', shell_id)
  url_format <- 'https://github.com/oilshell/benchmark-data/blob/master/%s/%s%s'
  sprintf(url_format, subdir, name, suffix)
}
|
46 |
|
# Relative link one directory up: ../<subdir>/<name><suffix>.
# Same signature as benchmarkDataLink so the two are interchangeable.
provenanceLink <- function(subdir, name, suffix) {
  link_format <- '../%s/%s%s'
  sprintf(link_format, subdir, name, suffix)
}
|
50 |
|
51 |
|
GetOshLabel <- function(shell_hash, prov_dir) {
  ### Given an osh shell hash, return a label string for that build.
  ###
  ### Reads <prov_dir>/shell-id/osh-<shell_hash>/sh-path.txt and classifies the
  ### recorded path.  Stops if the file is missing or no pattern matches.

  path <- sprintf('%s/shell-id/osh-%s/sh-path.txt', prov_dir, shell_hash)

  if (!file.exists(path)) {
    stop(sprintf("%s doesn't exist", path))
  }

  Log('Reading %s', path)
  lines <- readLines(path)

  # TRUE when any line of the file matches the regex.
  has_match <- function(pattern) {
    length(grep(pattern, lines)) > 0
  }

  # Order matters: 'bin/osh' also matches '_bin/osh', so the more specific
  # pattern is tested first.
  if (has_match('_bin/osh')) {
    return('osh-ovm')
  }
  if (has_match('bin/osh')) {
    return('osh-cpython')
  }
  if (has_match('_bin/.*/osh')) {
    return('osh-native')
  }
  stop("Expected _bin/osh, bin/osh, or _bin/.*/osh")
}
|
74 |
|
# Path suffixes of the optimized C++ osh binaries; matched with endsWith()
# by ShellLabels() and ShellLabelFromPath().
opt_suffix1 <- '_bin/cxx-opt/osh'
opt_suffix2 <- '_bin/cxx-opt-sh/osh'
|
77 |
|
ShellLabels <- function(shell_name, shell_hash, num_hosts) {
  ### Given 2 parallel vectors, return a character vector of readable labels.
  ###
  ### shell_name: shell/runtime names (basenames or paths, see TODO below)
  ### shell_hash: hashes parallel to shell_name; only read for entries whose
  ###             name is exactly 'osh'
  ### num_hosts:  number of distinct hosts; selects the provenance directory
  ###
  ### Returns character(0) for empty input.

  # TODO: Clean up callers.  Some metrics call this function with a
  # shell/runtime BASENAME, and others a PATH
  # - e.g. ComputeReport calls this with runtime_name which is actually a PATH

  #Log('name %s', shell_name)
  #Log('hash %s', shell_hash)

  if (num_hosts == 1) {
    prov_dir <- '_tmp'
  } else {
    prov_dir <- '../benchmark-data/'
  }

  # seq_along() plus a preallocated result: 1:length(x) would iterate over
  # c(1, 0) for empty input and then fail on the NA comparison below.
  labels <- character(length(shell_name))
  for (i in seq_along(shell_name)) {
    sh <- shell_name[i]
    if (sh == 'osh') {
      # shell_hash is only touched here, so callers that never pass a plain
      # 'osh' name do not need a meaningful hash vector.
      label <- GetOshLabel(shell_hash[i], prov_dir)

    } else if (endsWith(sh, opt_suffix1) || endsWith(sh, opt_suffix2)) {
      label <- 'opt/osh'

    } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
      label <- 'bumpleak/osh'

    } else {
      label <- sh
    }

    Log('[%s] [%s]', shell_name[i], label)
    labels[i] <- label
  }

  labels
}
|
116 |
|
# Simple version of ShellLabels(), used by benchmarks/gc.
#
# Maps each shell path in sh_path to a readable label, using only the path
# (no provenance lookup).  Unrecognized paths are returned unchanged.
# Returns character(0) for empty input.
ShellLabelFromPath <- function(sh_path) {
  label_one <- function(sh) {
    if (endsWith(sh, opt_suffix1) || endsWith(sh, opt_suffix2)) {
      # the opt binary is osh-native
      'osh-native'

    } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
      'bumpleak/osh'

    } else if (endsWith(sh, '_bin/osh')) {  # the app bundle
      'osh-ovm'

    } else if (endsWith(sh, 'bin/osh')) {
      # Tested after '_bin/osh', which this suffix would also match.
      'osh-cpython'

    } else {
      sh
    }
  }

  # vapply() handles zero-length input; the old 1:length(sh_path) loop ran
  # with indices c(1, 0) and failed on the resulting NA element.
  vapply(sh_path, label_one, character(1), USE.NAMES = FALSE)
}
|
143 |
|
DistinctHosts <- function(t) {
  ### Return the distinct (host_name, host_hash) rows of t, with an added
  ### host_label column.

  hosts <- distinct(t, host_name, host_hash)
  # The label is just the name
  hosts$host_label <- hosts$host_name
  hosts
}
|
150 |
|
DistinctShells <- function(t, num_hosts = -1) {
  ### Return the distinct (shell_name, shell_hash) rows of t, with an added
  ### shell_label column computed by ShellLabels().
  ###
  ### num_hosts: number of hosts; -1 means "derive it from t".

  shells <- distinct(t, shell_name, shell_hash)

  Log('')
  Log('Labeling shells')

  # Calculate it if not passed
  if (num_hosts == -1) {
    num_hosts <- nrow(DistinctHosts(t))
  }

  shells$shell_label <- ShellLabels(shells$shell_name, shells$shell_hash,
                                    num_hosts)
  shells
}
|
167 |
|
ParserReport <- function(in_dir, out_dir) {
  ### Build the osh-parser report tables from the data files in in_dir and
  ### write them under out_dir.
  ###
  ### Reads times.csv, lines.csv, raw-data.csv and cachegrind.tsv.
  ### Always writes: hosts, shells, raw-data, summary, cachegrind_summary.
  ### With the single 'no-host' label it also writes the flat tables
  ### (times_flat, cachegrind_flat); otherwise it writes the per-shell
  ### comparison tables (elapsed, rate, max_rss, instructions).
  ###
  ### Log, readTsv, writeCsv, writeTsv, SamePrecision and ColumnPrecision are
  ### not defined in this file (presumably benchmarks/common.R).

  times <- read.csv(file.path(in_dir, 'times.csv'))
  lines <- read.csv(file.path(in_dir, 'lines.csv'))
  raw_data <- read.csv(file.path(in_dir, 'raw-data.csv'))

  cachegrind <- readTsv(file.path(in_dir, 'cachegrind.tsv'))

  # Remove failures
  times <- times %>% filter(status == 0) %>% select(-c(status))
  cachegrind <- cachegrind %>% filter(status == 0) %>% select(-c(status))

  # Add the number of lines, joining on path, and compute lines/ms
  joined_times <- times %>%
    left_join(lines, by = c('path')) %>%
    mutate(filename = basename(path), filename_HREF = sourceUrl(path),
           max_rss_MB = max_rss_KiB * 1024 / 1e6,
           elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           lines_per_ms = num_lines / elapsed_ms) %>%
    select(-c(path, max_rss_KiB, elapsed_secs, user_secs, sys_secs))

  print(summary(joined_times))

  #
  # Find distinct shells and hosts, and label them for readability.
  #

  distinct_hosts <- DistinctHosts(joined_times)
  Log('')
  Log('Distinct hosts')
  print(distinct_hosts)

  distinct_shells <- DistinctShells(joined_times)
  Log('')
  Log('Distinct shells')
  print(distinct_shells)

  # Replace name/hash combinations with labels.
  joined_times <- joined_times %>%
    left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
    left_join(distinct_shells, by = c('shell_name', 'shell_hash')) %>%
    select(-c(host_name, host_hash, shell_name, shell_hash))

  # Like 'times', but do shell_label as one step
  # Hack: we know benchmarks/auto.sh runs this on one machine
  distinct_shells_2 <- DistinctShells(cachegrind,
                                      num_hosts = nrow(distinct_hosts))
  joined_cachegrind <- cachegrind %>%
    left_join(lines, by = c('path')) %>%
    select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
    left_join(distinct_shells_2, by = c('shell_name', 'shell_hash')) %>%
    select(-c(shell_name, shell_hash)) %>%
    mutate(filename = basename(path), filename_HREF = sourceUrl(path)) %>%
    select(-c(path))

  Log('summary(joined_times):')
  print(summary(joined_times))
  Log('head(joined_times):')
  print(head(joined_times))

  # Summarize rates by platform/shell
  times_summary <- joined_times %>%
    mutate(host_label = paste("host", host_label)) %>%
    group_by(host_label, shell_label) %>%
    summarize(total_lines = sum(num_lines), total_ms = sum(elapsed_ms)) %>%
    mutate(lines_per_ms = total_lines / total_ms) %>%
    select(-c(total_ms)) %>%
    spread(key = host_label, value = lines_per_ms)

  # Sort by parsing rate on the fast machine when it is present, then on the
  # single-machine 'no-host' label, and otherwise on the first host column
  # (previously any other host name failed with "object not found").
  host_cols <- grep('^host ', colnames(times_summary), value = TRUE)
  if ('host lenny' %in% host_cols) {
    sort_col <- 'host lenny'
  } else if ('host no-host' %in% host_cols) {
    sort_col <- 'host no-host'
  } else {
    sort_col <- host_cols[1]
  }
  times_summary <- times_summary %>% arrange(desc(.data[[sort_col]]))

  Log('times_summary:')
  print(times_summary)

  # Summarize cachegrind by platform/shell
  # Bug fix: as.numeric(irefs) avoids 32-bit integer overflow!
  cachegrind_summary <- joined_cachegrind %>%
    group_by(shell_label) %>%
    summarize(total_lines = sum(num_lines),
              total_irefs = sum(as.numeric(irefs))) %>%
    mutate(thousand_irefs_per_line = total_irefs / total_lines / 1000) %>%
    select(-c(total_irefs))

  if ("no-host" %in% distinct_hosts$host_label) {

    # We don't have all the shells, so skip the per-shell comparison tables
    elapsed <- NULL
    rate <- NULL
    max_rss <- NULL
    instructions <- NULL

    times_flat <- joined_times %>%
      select(c(shell_label, elapsed_ms, user_ms, sys_ms, max_rss_MB,
               num_lines, filename, filename_HREF)) %>%
      arrange(filename, elapsed_ms)

    cachegrind_flat <- joined_cachegrind %>%
      select(c(shell_label, irefs, num_lines, filename, filename_HREF)) %>%
      arrange(filename, irefs)

  } else {

    times_flat <- NULL
    cachegrind_flat <- NULL

    # Elapsed seconds for each shell by platform and file
    elapsed <- joined_times %>%
      select(-c(lines_per_ms, user_ms, sys_ms, max_rss_MB)) %>%
      spread(key = shell_label, value = elapsed_ms) %>%
      arrange(host_label, num_lines) %>%
      mutate(osh_to_bash_ratio = `osh-native` / bash) %>%
      select(c(host_label, bash, dash, mksh, zsh,
               `osh-ovm`, `osh-cpython`, `osh-native`,
               osh_to_bash_ratio, num_lines, filename, filename_HREF))

    Log('\n')
    Log('ELAPSED')
    print(elapsed)

    # Rates by file and shell
    rate <- joined_times %>%
      select(-c(elapsed_ms, user_ms, sys_ms, max_rss_MB)) %>%
      spread(key = shell_label, value = lines_per_ms) %>%
      arrange(host_label, num_lines) %>%
      select(c(host_label, bash, dash, mksh, zsh,
               `osh-ovm`, `osh-cpython`, `osh-native`,
               num_lines, filename, filename_HREF))

    Log('\n')
    Log('RATE')
    print(rate)

    # Memory usage by file
    max_rss <- joined_times %>%
      select(-c(elapsed_ms, lines_per_ms, user_ms, sys_ms)) %>%
      spread(key = shell_label, value = max_rss_MB) %>%
      arrange(host_label, num_lines) %>%
      select(c(host_label, bash, dash, mksh, zsh,
               `osh-ovm`, `osh-cpython`, `osh-native`,
               num_lines, filename, filename_HREF))

    Log('\n')
    Log('MAX RSS')
    print(max_rss)

    Log('\n')
    Log('joined_cachegrind has %d rows', nrow(joined_cachegrind))
    print(joined_cachegrind)
    #print(joined_cachegrind %>% filter(path == 'benchmarks/testdata/configure-helper.sh'))

    # Cachegrind instructions by file
    instructions <- joined_cachegrind %>%
      mutate(thousand_irefs_per_line = irefs / num_lines / 1000) %>%
      select(-c(irefs)) %>%
      spread(key = shell_label, value = thousand_irefs_per_line) %>%
      arrange(num_lines) %>%
      select(c(bash, dash, mksh, `osh-native`,
               num_lines, filename, filename_HREF))

    Log('\n')
    Log('instructions has %d rows', nrow(instructions))
    print(instructions)
  }

  WriteProvenance(distinct_hosts, distinct_shells, out_dir)

  raw_data_table <- tibble(
    filename = basename(as.character(raw_data$path)),
    filename_HREF = benchmarkDataLink('osh-parser', filename, '')
  )
  #print(raw_data_table)

  writeCsv(raw_data_table, file.path(out_dir, 'raw-data'))

  precision <- SamePrecision(0)  # lines per ms
  writeCsv(times_summary, file.path(out_dir, 'summary'), precision)

  precision <- ColumnPrecision(list(), default = 1)
  writeTsv(cachegrind_summary, file.path(out_dir, 'cachegrind_summary'),
           precision)

  if (!is.null(times_flat)) {
    precision <- SamePrecision(0)
    writeTsv(times_flat, file.path(out_dir, 'times_flat'), precision)
  }

  if (!is.null(cachegrind_flat)) {
    precision <- SamePrecision(0)
    writeTsv(cachegrind_flat, file.path(out_dir, 'cachegrind_flat'), precision)
  }

  # elapsed is non-NULL only when the host label is NOT 'no-host'; rate,
  # max_rss and instructions are set in the same branch.
  if (!is.null(elapsed)) {
    # Round to nearest millisecond, but the ratio has a decimal point.
    precision <- ColumnPrecision(list(osh_to_bash_ratio = 1), default = 0)
    writeCsv(elapsed, file.path(out_dir, 'elapsed'), precision)

    precision <- SamePrecision(0)
    writeCsv(rate, file.path(out_dir, 'rate'), precision)

    writeCsv(max_rss, file.path(out_dir, 'max_rss'))

    precision <- SamePrecision(1)
    writeTsv(instructions, file.path(out_dir, 'instructions'), precision)
  }

  Log('Wrote %s', out_dir)
}
|
400 |
|
WriteProvenance <- function(distinct_hosts, distinct_shells, out_dir,
                            tsv = FALSE) {
  ### Write the 'hosts' and 'shells' provenance tables under out_dir.
  ###
  ### distinct_hosts:  needs host_label, host_name, host_hash
  ### distinct_shells: needs shell_label, shell_name, shell_hash
  ### tsv:             write with writeTsv() when TRUE, else writeCsv()
  ###
  ### With exactly one host the links are relative (provenanceLink);
  ### otherwise they point into the benchmark-data repo (benchmarkDataLink).

  num_hosts <- nrow(distinct_hosts)
  if (num_hosts == 1) {
    linkify <- provenanceLink
  } else {
    linkify <- benchmarkDataLink
  }

  Log('distinct_hosts')
  print(distinct_hosts)
  Log('')

  Log('distinct_shells')
  print(distinct_shells)
  Log('')

  # Should be:
  # host_id_url
  # And then csv_to_html will be smart enough?  It should take --url flag?
  host_table <- tibble(
    host_label = distinct_hosts$host_label,
    host_id = paste(distinct_hosts$host_name,
                    distinct_hosts$host_hash, sep = '-'),
    # refers to the host_id column defined just above
    host_id_HREF = linkify('host-id', host_id, '/')
  )
  Log('host_table')
  print(host_table)
  Log('')

  shell_table <- tibble(
    shell_label = distinct_shells$shell_label,
    shell_id = paste(distinct_shells$shell_name,
                     distinct_shells$shell_hash, sep = '-'),
    shell_id_HREF = linkify('shell-id', shell_id, '/')
  )

  Log('shell_table')
  print(shell_table)
  Log('')

  # Pick the writer once instead of duplicating both calls per format.
  if (tsv) {
    write_table <- writeTsv
  } else {
    write_table <- writeCsv
  }
  write_table(host_table, file.path(out_dir, 'hosts'))
  write_table(shell_table, file.path(out_dir, 'shells'))
}
|
450 |
|
WriteSimpleProvenance <- function(provenance, out_dir) {
  ### Derive the distinct host and shell tables from a provenance table and
  ### write them as TSV via WriteProvenance().

  Log('provenance')
  print(provenance)
  Log('')

  # Legacy: add $shell_name, because "$shell_basename-$shell_hash" is what
  # benchmarks/id.sh publish-shell-id uses
  with_names <- mutate(provenance, shell_name = basename(sh_path))
  distinct_shells <- distinct(with_names, shell_label, shell_name, shell_hash)

  Log('distinct_shells')
  print(distinct_shells)
  Log('')

  distinct_hosts <- distinct(provenance, host_label, host_name, host_hash)

  WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = TRUE)
}
|
471 |
|
RuntimeReport <- function(in_dir, out_dir) {
  ### Build the osh-runtime report tables from in_dir and write them under
  ### out_dir.
  ###
  ### Reads times.tsv, gc_stats.tsv and provenance.tsv.  Stops if any task in
  ### times.tsv has a nonzero status.
  ### Writes hosts, shells, elapsed, page_faults, max_rss, gc_stats, details
  ### and details_io.

  times <- readTsv(file.path(in_dir, 'times.tsv'))

  gc_stats <- readTsv(file.path(in_dir, 'gc_stats.tsv'))
  provenance <- readTsv(file.path(in_dir, 'provenance.tsv'))

  failed <- times %>% filter(status != 0)
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some osh-runtime tasks failed')
  }

  # Joins:
  # times <= sh_path => provenance
  # times <= join_id, host_name => gc_stats

  # TODO: provenance may have rows from 2 machines.  Could validate them and
  # deduplicate.

  # It should have (host_label, host_name, host_hash)
  #                (shell_label, sh_path, shell_hash)
  provenance <- provenance %>%
    mutate(host_label = host_name,
           shell_label = ShellLabelFromPath(sh_path))

  label_lookup <- provenance %>% distinct(sh_path, shell_label)

  Log('label_lookup')
  print(label_lookup)

  # Join with provenance for host label and shell label
  details <- times %>%
    select(c(elapsed_secs, user_secs, sys_secs, max_rss_KiB, task_id,
             host_name, sh_path, workload)) %>%
    mutate(elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
    select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
    left_join(label_lookup, by = c('sh_path')) %>%
    select(-c(sh_path)) %>%
    # we want to compare workloads on adjacent rows
    arrange(workload)

  details_io <- times %>%
    select(c(task_id, host_name, sh_path, workload, minor_faults,
             major_faults, swaps, in_block, out_block, signals,
             voluntary_ctx, involuntary_ctx)) %>%
    left_join(label_lookup, by = c('sh_path')) %>%
    select(-c(sh_path)) %>%
    # we want to compare workloads on adjacent rows
    arrange(workload)

  Log('details')
  print(details)

  # Elapsed time comparison: one column per shell, plus ratios against bash
  elapsed <- details %>%
    select(-c(task_id, user_ms, sys_ms, max_rss_MB)) %>%
    spread(key = shell_label, value = elapsed_ms) %>%
    mutate(py_bash_ratio = `osh-cpython` / bash,
           native_bash_ratio = `osh-native` / bash) %>%
    arrange(workload, host_name) %>%
    select(c(workload, host_name,
             bash, dash, `osh-cpython`, `osh-native`,
             py_bash_ratio, native_bash_ratio))

  Log('elapsed')
  print(elapsed)

  # Minor Page Faults Comparison
  page_faults <- details_io %>%
    select(c(host_name, shell_label, workload, minor_faults)) %>%
    spread(key = shell_label, value = minor_faults) %>%
    mutate(py_bash_ratio = `osh-cpython` / bash,
           native_bash_ratio = `osh-native` / bash) %>%
    arrange(workload, host_name) %>%
    select(c(workload, host_name,
             bash, dash, `osh-cpython`, `osh-native`,
             py_bash_ratio, native_bash_ratio))

  Log('page_faults')
  print(page_faults)

  # Max RSS comparison
  max_rss <- details %>%
    select(-c(task_id, elapsed_ms, user_ms, sys_ms)) %>%
    spread(key = shell_label, value = max_rss_MB) %>%
    mutate(py_bash_ratio = `osh-cpython` / bash,
           native_bash_ratio = `osh-native` / bash) %>%
    arrange(workload, host_name) %>%
    select(c(workload, host_name,
             bash, dash, `osh-cpython`, `osh-native`,
             py_bash_ratio, native_bash_ratio))

  Log('max rss')
  print(max_rss)

  # Key the timing rows the same way gc_stats is keyed: "gc-<task_id>"
  gc_details <- details %>%
    select(c(task_id, host_name, workload, elapsed_ms, max_rss_MB)) %>%
    mutate(join_id = sprintf("gc-%d", task_id)) %>%
    select(-c(task_id))

  Log('GC stats')
  print(gc_stats)

  gc_stats <- gc_stats %>%
    left_join(gc_details, by = c('join_id', 'host_name')) %>%
    select(-c(join_id, roots_capacity, objs_capacity)) %>%
    # Do same transformations as GcReport()
    mutate(allocated_MB = bytes_allocated / 1e6) %>%
    select(-c(bytes_allocated)) %>%
    rename(num_gc_done = num_collections) %>%
    # Put these columns first
    relocate(workload, host_name,
             elapsed_ms, max_gc_millis, total_gc_millis,
             allocated_MB, max_rss_MB, num_allocated)

  Log('After GC stats')
  print(gc_stats)

  WriteSimpleProvenance(provenance, out_dir)

  # milliseconds don't need decimal digit
  precision <- ColumnPrecision(list(bash = 0, dash = 0, `osh-cpython` = 0,
                                    `osh-native` = 0, py_bash_ratio = 2,
                                    native_bash_ratio = 2))
  writeTsv(elapsed, file.path(out_dir, 'elapsed'), precision)
  writeTsv(page_faults, file.path(out_dir, 'page_faults'), precision)

  precision2 <- ColumnPrecision(list(py_bash_ratio = 2, native_bash_ratio = 2))
  writeTsv(max_rss, file.path(out_dir, 'max_rss'), precision2)

  precision3 <- ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
                                default = 0)
  writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision3)

  writeTsv(details, file.path(out_dir, 'details'), precision3)
  writeTsv(details_io, file.path(out_dir, 'details_io'))

  Log('Wrote %s', out_dir)
}
|
619 |
|
VmBaselineReport <- function(in_dir, out_dir) {
  ### Read vm-baseline.csv from in_dir and write a 'vm-baseline' table with
  ### VmRSS and VmPeak in MB per (shell_label, shell_hash, host).

  vm <- read.csv(file.path(in_dir, 'vm-baseline.csv'))
  #print(vm)

  # Not using DistinctHosts() because field host_hash isn't collected
  num_hosts <- nrow(distinct(vm, host))

  vm <- vm %>%
    rename(kib = metric_value) %>%
    mutate(shell_label = ShellLabels(shell_name, shell_hash, num_hosts),
           megabytes = kib * 1024 / 1e6) %>%
    select(-c(shell_name, kib)) %>%
    # one column per metric name
    spread(key = c(metric_name), value = megabytes) %>%
    rename(VmPeak_MB = VmPeak, VmRSS_MB = VmRSS) %>%
    select(c(shell_label, shell_hash, host, VmRSS_MB, VmPeak_MB)) %>%
    arrange(shell_label, shell_hash, host, VmPeak_MB)

  print(vm)

  writeCsv(vm, file.path(out_dir, 'vm-baseline'))
}
|
642 |
|
WriteOvmBuildDetails <- function(distinct_hosts, distinct_compilers, out_dir) {
  ### Write the 'hosts' and 'compilers' tables for the ovm-build report, with
  ### links into the benchmark-data repo.

  host_ids <- paste(distinct_hosts$host_name, distinct_hosts$host_hash,
                    sep = '-')
  host_table <- tibble(
    host_label = distinct_hosts$host_label,
    host_id = host_ids,
    host_id_HREF = benchmarkDataLink('host-id', host_id, '/')
  )
  print(host_table)

  compiler_ids <- paste(distinct_compilers$compiler_label,
                        distinct_compilers$compiler_hash, sep = '-')
  compiler_table <- tibble(
    compiler_label = distinct_compilers$compiler_label,
    compiler_id = compiler_ids,
    compiler_id_HREF = benchmarkDataLink('compiler-id', compiler_id, '/')
  )
  print(compiler_table)

  writeTsv(host_table, file.path(out_dir, 'hosts'))
  writeTsv(compiler_table, file.path(out_dir, 'compilers'))
}
|
663 |
|
OvmBuildReport <- function(in_dir, out_dir) {
  ### Build the ovm-build report tables from in_dir and write them under
  ### out_dir.
  ###
  ### Reads times.tsv, bytecode-size.tsv, bin-sizes.tsv, native-sizes.tsv and
  ### raw-data.tsv.  Stops if any task in times.tsv has a nonzero status.
  ### Writes hosts, compilers, times, bytecode-size, sizes and native-sizes.

  times <- readTsv(file.path(in_dir, 'times.tsv'))
  bytecode_size <- readTsv(file.path(in_dir, 'bytecode-size.tsv'))
  bin_sizes <- readTsv(file.path(in_dir, 'bin-sizes.tsv'))
  native_sizes <- readTsv(file.path(in_dir, 'native-sizes.tsv'))
  # Read but not used below; kept so a missing file is still reported.
  raw_data <- readTsv(file.path(in_dir, 'raw-data.tsv'))

  failed <- times %>% filter(status != 0)
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some ovm-build tasks failed')
  }

  # Distinct (host_name, host_hash) rows, labeled by host name
  distinct_hosts <- DistinctHosts(times)

  distinct_compilers <- distinct(times, compiler_path, compiler_hash)
  distinct_compilers$compiler_label <- basename(distinct_compilers$compiler_path)

  #print(distinct_hosts)
  #print(distinct_compilers)

  WriteOvmBuildDetails(distinct_hosts, distinct_compilers, out_dir)

  # One elapsed_secs column per host.  The is_* flags exist only to order
  # the rows and are dropped afterwards.
  times <- times %>%
    select(-c(status)) %>%
    left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
    left_join(distinct_compilers,
              by = c('compiler_path', 'compiler_hash')) %>%
    select(-c(host_name, host_hash, compiler_path, compiler_hash)) %>%
    mutate(src_dir = basename(src_dir),
           host_label = paste("host ", host_label),
           is_conf = str_detect(action, 'configure'),
           is_ovm = str_detect(action, 'oil.ovm'),
           is_dbg = str_detect(action, 'dbg')) %>%
    select(host_label, src_dir, compiler_label, action, is_conf, is_ovm,
           is_dbg, elapsed_secs) %>%
    spread(key = c(host_label), value = elapsed_secs) %>%
    arrange(src_dir, compiler_label, desc(is_conf), is_ovm, desc(is_dbg)) %>%
    select(-c(is_conf, is_ovm, is_dbg))

  #print(times)

  bytecode_size <- bytecode_size %>%
    rename(bytecode_size = num_bytes) %>%
    select(-c(path))

  sizes <- bin_sizes %>%
    # reorder
    select(c(host_label, path, num_bytes)) %>%
    left_join(bytecode_size, by = c('host_label')) %>%
    mutate(native_code_size = num_bytes - bytecode_size)

  # paths look like _tmp/ovm-build/bin/clang/oils_cpp.stripped
  native_sizes <- native_sizes %>%
    select(c(host_label, path, num_bytes)) %>%
    mutate(host_label = paste("host ", host_label),
           binary = basename(path),
           compiler = basename(dirname(path))) %>%
    select(-c(path)) %>%
    spread(key = c(host_label), value = num_bytes) %>%
    arrange(compiler, binary)

  # NOTE: These don't have the host and compiler.
  writeTsv(times, file.path(out_dir, 'times'))
  writeTsv(bytecode_size, file.path(out_dir, 'bytecode-size'))
  writeTsv(sizes, file.path(out_dir, 'sizes'))
  writeTsv(native_sizes, file.path(out_dir, 'native-sizes'))

  # TODO: I want a size report too
  #writeCsv(sizes, file.path(out_dir, 'sizes'))
}
|
741 |
|
# Check that t has exactly num_expected distinct stdout_md5sum values.
# On mismatch, print the identifying columns and stop.
unique_stdout_md5sum <- function(t, num_expected) {
  num_unique <- n_distinct(t$stdout_md5sum)
  if (num_unique == num_expected) {
    return(invisible(NULL))
  }

  mismatch <- select(t, c(host_name, task_name, arg1, arg2, runtime_name,
                          stdout_md5sum))
  print(mismatch)
  stop(sprintf('Expected %d unique md5sums, got %d', num_expected, num_unique))
}
|
749 |
|
ComputeReport <- function(in_dir, out_dir) {
  ### Build the compute benchmark report tables from in_dir/times.tsv and
  ### write them under out_dir.
  ###
  ### Stops if any task has a nonzero status, or if the stdout checksums of a
  ### checked task disagree across runtimes.
  ### Writes details, stdout_files, one table per task (hello, fib, word_freq,
  ### parse_help, bubble_sort, palindrome), plus hosts and shells.

  # TSV file, not CSV
  times <- read.table(file.path(in_dir, 'times.tsv'), header = TRUE)
  print(times)

  failed <- times %>% filter(status != 0)
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some compute tasks failed')
  }

  #
  # Check correctness
  #

  times %>% filter(task_name == 'hello') %>% unique_stdout_md5sum(1)
  times %>% filter(task_name == 'fib') %>% unique_stdout_md5sum(1)
  times %>% filter(task_name == 'word_freq') %>% unique_stdout_md5sum(1)
  # 3 different inputs
  times %>% filter(task_name == 'parse_help') %>% unique_stdout_md5sum(3)

  times %>% filter(task_name == 'bubble_sort') %>% unique_stdout_md5sum(2)

  # TODO:
  # - oils_cpp doesn't implement unicode LANG=C
  # - bash behaves differently on your desktop vs. in the container
  #   - might need layer-locales in the image?

  #times %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>% unique_stdout_md5sum(1)
  # Ditto here
  #times %>% filter(task_name == 'palindrome' & arg1 == 'bytes') %>% unique_stdout_md5sum(1)

  #
  # Find distinct shells and hosts, and label them for readability.
  #

  # Runtimes are called shells, as a hack for code reuse
  tmp <- times %>%
    mutate(shell_name = runtime_name, shell_hash = runtime_hash) %>%
    select(c(host_name, host_hash, shell_name, shell_hash))

  distinct_hosts <- DistinctHosts(tmp)
  Log('')
  Log('Distinct hosts')
  print(distinct_hosts)

  distinct_shells <- DistinctShells(tmp)
  Log('')
  Log('Distinct runtimes')
  print(distinct_shells)

  num_hosts <- nrow(distinct_hosts)

  # runtime_hash must still be a column when ShellLabels() is called, so it is
  # dropped AFTER the mutate.  Dropping it first only worked because
  # ShellLabels() never reads the hash unless a runtime is named exactly 'osh'.
  details <- times %>%
    select(-c(status, stdout_md5sum, stdout_filename, host_hash)) %>%
    mutate(runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
           elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
    select(-c(runtime_name, runtime_hash,
              elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
    arrange(host_name, task_name, arg1, arg2, user_ms)

  stdout_files <- times %>%
    mutate(
      runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
      stdout_md5sum_HREF = file.path('tmp', task_name, stdout_filename)) %>%
    select(c(host_name, task_name, arg1, arg2, runtime_label,
             stdout_md5sum, stdout_md5sum_HREF))

  hello <- details %>% filter(task_name == 'hello') %>% select(-c(task_name))
  fib <- details %>% filter(task_name == 'fib') %>% select(-c(task_name))
  word_freq <- details %>%
    filter(task_name == 'word_freq') %>%
    select(-c(task_name))
  # There's no arg2
  parse_help <- details %>%
    filter(task_name == 'parse_help') %>%
    select(-c(task_name, arg2))

  bubble_sort <- details %>%
    filter(task_name == 'bubble_sort') %>%
    select(-c(task_name))
  # Only the 'unicode' variant is reported
  palindrome <- details %>%
    filter(task_name == 'palindrome' & arg1 == 'unicode') %>%
    select(-c(task_name))

  precision <- ColumnPrecision(list(max_rss_MB = 1), default = 0)
  writeTsv(details, file.path(out_dir, 'details'), precision)

  writeTsv(stdout_files, file.path(out_dir, 'stdout_files'), precision)

  writeTsv(hello, file.path(out_dir, 'hello'), precision)
  writeTsv(fib, file.path(out_dir, 'fib'), precision)
  writeTsv(word_freq, file.path(out_dir, 'word_freq'), precision)
  writeTsv(parse_help, file.path(out_dir, 'parse_help'), precision)

  writeTsv(bubble_sort, file.path(out_dir, 'bubble_sort'), precision)
  writeTsv(palindrome, file.path(out_dir, 'palindrome'), precision)

  WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = TRUE)
}
|
847 |
|
# Write the rows of `times` whose `task` column equals task_name to
# <out_dir>/<task_name>, without the `task` column.
WriteOneTask <- function(times, out_dir, task_name, precision) {
  task_rows <- filter(times, task == task_name)
  task_rows <- select(task_rows, -c(task))

  writeTsv(task_rows, file.path(out_dir, task_name), precision)
}
|
855 |
|
# Row order for the GC reports: used as factor levels for sh_path when
# sorting.
SHELL_ORDER <- c(
  'dash',
  'bash',
  'zsh',
  '_bin/cxx-opt+bumpleak/osh',
  '_bin/cxx-opt+bumproot/osh',
  '_bin/cxx-opt+bumpsmall/osh',
  '_bin/cxx-opt/osh',
  '_bin/cxx-opt+nopool/osh'
)
|
864 |
|
GcReport <- function(in_dir, out_dir) {
  ### Build the GC benchmark report tables and write them under out_dir.
  ###
  ### Reads raw/times.tsv and stage1/gc_stats.tsv from in_dir.  Stops if any
  ### task has a nonzero status.
  ### Writes times, gc_stats, and one table per task listed below.

  times <- read.table(file.path(in_dir, 'raw/times.tsv'), header = TRUE)
  gc_stats <- read.table(file.path(in_dir, 'stage1/gc_stats.tsv'),
                         header = TRUE)

  failed <- times %>% filter(status != 0)
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some gc tasks failed')
  }

  # Change units and order columns
  times <- times %>%
    arrange(task, factor(sh_path, levels = SHELL_ORDER)) %>%
    mutate(elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6,
           shell_label = ShellLabelFromPath(sh_path)) %>%
    select(c(join_id, task, elapsed_ms, user_ms, sys_ms, max_rss_MB,
             shell_label, shell_runtime_opts))

  # Join and order columns
  gc_stats <- gc_stats %>%
    left_join(times, by = c('join_id')) %>%
    arrange(desc(task)) %>%
    mutate(allocated_MB = bytes_allocated / 1e6) %>%
    # try to make the table skinnier
    rename(num_gc_done = num_collections) %>%
    select(task, elapsed_ms, max_gc_millis, total_gc_millis,
           allocated_MB, max_rss_MB, num_allocated,
           num_gc_points, num_gc_done, gc_threshold, num_growths,
           max_survived, shell_label)

  # join_id was only needed for the join above
  times <- times %>% select(-c(join_id))

  precision <- ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
                               default = 0)

  writeTsv(times, file.path(out_dir, 'times'), precision)
  writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision)

  tasks <- c('parse.configure-coreutils',
             'parse.configure-cpython',
             'parse.abuild',
             'ex.compute-fib',
             'ex.bashcomp-parse-help',
             'ex.abuild-print-help')
  # Write out separate rows
  for (one_task in tasks) {
    WriteOneTask(times, out_dir, one_task, precision)
  }
}
|
920 |
|
# Build the cachegrind instruction-count tables.
#
# Reads <in_dir>/raw/times.tsv and <in_dir>/stage1/cachegrind.tsv, aborts if
# any task has a nonzero status, joins the counts to the task metadata on
# join_id, and writes one table per task (irefs in millions) into out_dir.
GcCachegrindReport = function(in_dir, out_dir) {
  times = readTsv(file.path(in_dir, 'raw/times.tsv'))
  raw_counts = readTsv(file.path(in_dir, 'stage1/cachegrind.tsv'))

  failed = filter(times, status != 0)
  if (nrow(failed) > 0) {
    print(failed)
    stop('Some gc tasks failed')
  }

  print(times)
  print(raw_counts)

  joined = left_join(raw_counts, times, by = c('join_id'))
  joined = mutate(joined, million_irefs = irefs / 1e6)
  joined = select(joined,
                  c(million_irefs, task, sh_path, shell_runtime_opts))
  # Rows follow SHELL_ORDER
  counts = arrange(joined, factor(sh_path, levels = SHELL_ORDER))

  precision = NULL
  for (task_name in c('parse.abuild', 'ex.compute-fib')) {
    WriteOneTask(counts, out_dir, task_name, precision)
  }
}
|
946 |
|
# Build the mycpp example benchmark tables.
#
# Reads <in_dir>/benchmark-table.tsv, aborts if any task has a nonzero status,
# and writes four tables into out_dir: 'user_time', 'sys_time' and 'max_rss'
# (one column per impl plus a `C++ : Python` ratio, sorted by that ratio) and
# 'details' (the long-format data).
MyCppReport = function(in_dir, out_dir) {
  times = readTsv(file.path(in_dir, 'benchmark-table.tsv'))
  print(times)

  failed = filter(times, status != 0)
  if (nrow(failed) > 0) {
    print(failed)
    stop('Some mycpp tasks failed')
  }

  # Drop columns the report doesn't show, link each example to its source,
  # and convert secs -> ms and KiB -> MB.
  details = times %>%
    select(-c(status, elapsed_secs, bin, task_out)) %>%
    mutate(
      example_name_HREF = mycppUrl(example_name),
      user_ms = user_secs * 1000,
      sys_ms = sys_secs * 1000,
      max_rss_MB = max_rss_KiB * 1024 / 1e6
    ) %>%
    select(-c(user_secs, sys_secs, max_rss_KiB))

  # Shared tail of the three wide tables: add the ratio column and sort by it.
  WithRatio = function(wide) {
    wide %>%
      mutate(`C++ : Python` = `C++` / Python) %>%
      arrange(`C++ : Python`)
  }

  user_time = details %>%
    select(-c(sys_ms, max_rss_MB)) %>%
    spread(key = impl, value = user_ms) %>%
    WithRatio()

  sys_time = details %>%
    select(-c(user_ms, max_rss_MB)) %>%
    spread(key = impl, value = sys_ms) %>%
    WithRatio()

  max_rss = details %>%
    select(-c(user_ms, sys_ms)) %>%
    spread(key = impl, value = max_rss_MB) %>%
    WithRatio()

  # Sometimes it speeds up by more than 10x, so the ratio gets 3 digits
  time_precision = ColumnPrecision(list(`C++ : Python` = 3), default = 0)
  writeTsv(user_time, file.path(out_dir, 'user_time'), time_precision)
  writeTsv(sys_time, file.path(out_dir, 'sys_time'), time_precision)

  rss_precision = ColumnPrecision(list(`C++ : Python` = 2), default = 1)
  writeTsv(max_rss, file.path(out_dir, 'max_rss'), rss_precision)

  writeTsv(details, file.path(out_dir, 'details'))
}
|
994 |
|
# Print an allocation report for one uftrace task and record its summary.
#
# Args:
#   env: environment mapping task name -> environment holding the data frames
#     untyped (column obj_len), typed (func_name), strings (str_len),
#     slabs (slab_len, func_name) and reserve (num_items).
#   task_name: name of the task to report on.
#   summaries: environment that receives the results; this function sets
#     summaries$stats[[task_name]] (a 1-row tibble of formatted totals and
#     percentages) and summaries$most_common_types[[task_name]].
#
# The report itself is emitted with print() and Log(); UftraceReport redirects
# it into a per-task file with sink().
#
# Side effect: sets options(tibble.print_min = 25).
UftraceTaskReport = function(env, task_name, summaries) {
  # Need this again after redirect
  MaybeDisableColor(stdout())

  task_env = env[[task_name]]

  untyped = task_env$untyped
  typed = task_env$typed
  strings = task_env$strings
  slabs = task_env$slabs
  reserve = task_env$reserve

  string_overhead = 17  # GC header (8) + len (4) + hash value (4) + NUL (1)
  strings = strings %>% mutate(obj_len = str_len + string_overhead)

  # TODO: Output these totals PER WORKLOAD, e.g. parsing big/small, executing
  # big/small
  #
  # And then zoom in on distributions as well

  #
  # All (untyped) allocations
  #

  num_allocs = nrow(untyped)
  total_bytes = sum(untyped$obj_len)

  # Histogram of allocation sizes; group_by() leaves it sorted by obj_len, so
  # cumsum() below gives "number of allocations of this size or smaller".
  untyped_hist = untyped %>% group_by(obj_len) %>% count() %>% ungroup()
  #print(untyped_hist)

  alloc_sizes = untyped_hist %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_allocs)

  a24 = untyped_hist %>% filter(obj_len <= 24)
  a48 = untyped_hist %>% filter(obj_len <= 48)
  a96 = untyped_hist %>% filter(obj_len <= 96)

  allocs_24_bytes_or_less = sum(a24$n) * 100.0 / num_allocs
  allocs_48_bytes_or_less = sum(a48$n) * 100.0 / num_allocs
  allocs_96_bytes_or_less = sum(a96$n) * 100.0 / num_allocs

  Log('Percentage of allocs less than 48 bytes: %.1f', allocs_48_bytes_or_less)

  options(tibble.print_min=25)

  Log('')
  Log('All allocations')
  print(alloc_sizes %>% head(22))
  print(alloc_sizes %>% tail(5))

  Log('')
  Log('Common Sizes')
  print(untyped_hist %>% arrange(desc(n)) %>% head(8))

  Log('')
  Log(' %s total allocations, total bytes = %s', commas(num_allocs), commas(total_bytes))
  Log('')

  #
  # Typed allocations
  #

  Log('Typed allocations')

  num_typed = nrow(typed)

  most_common_types = typed %>% group_by(func_name) %>% count() %>% ungroup() %>%
    mutate(percent = n * 100.0 / num_typed) %>%
    arrange(desc(n))

  print(most_common_types %>% head(20))
  print(most_common_types %>% tail(5))

  lists = typed %>% filter(str_starts(func_name, ('List<')))
  #print(lists)

  num_lists = nrow(lists)
  total_list_bytes = num_lists * 24  # sizeof List<T> head is hard-coded

  Log('')
  Log('%s typed allocs, including %s List<T>', commas(num_typed), commas(num_lists))
  Log('%.2f%% of allocs are typed', num_typed * 100 / num_allocs)
  Log('')

  #
  # Strings
  #

  num_strings = nrow(strings)
  total_string_bytes = sum(strings$obj_len)

  string_lengths = strings %>% group_by(str_len) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_strings)

  # Share of strings at or below a given length.  Count the strings directly
  # rather than looking up the histogram row whose str_len is exactly 6 / 14:
  # that row doesn't exist when no string has exactly that length, and the
  # resulting empty value can't be formatted for the summary table.
  strs_6_bytes_or_less = sum(strings$str_len <= 6) * 100.0 / num_strings
  strs_14_bytes_or_less = sum(strings$str_len <= 14) * 100.0 / num_strings

  # Parse workload
  # 62% of strings <= 6 bytes
  # 84% of strings <= 14 bytes

  Log('Str - NewStr() and OverAllocatedStr()')
  print(string_lengths %>% head(16))
  print(string_lengths %>% tail(5))
  Log('')

  Log('%s string allocations, total length = %s, total bytes = %s', commas(num_strings),
      commas(sum(strings$str_len)), commas(total_string_bytes))
  Log('')
  Log('%.2f%% of allocs are strings', num_strings * 100 / num_allocs)
  Log('%.2f%% of bytes are strings', total_string_bytes * 100 / total_bytes)
  Log('')

  #
  # Slabs
  #

  Log('NewSlab()')

  num_slabs = nrow(slabs)
  slab_lengths = slabs %>% group_by(slab_len) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_slabs)

  slab_types = slabs %>% group_by(func_name) %>% count() %>% ungroup() %>%
    arrange(desc(n))

  Log(' Lengths')
  print(slab_lengths %>% head())
  print(slab_lengths %>% tail(5))
  Log('')

  Log(' Slab Types')
  print(slab_types %>% head())
  print(slab_types %>% tail(5))
  Log('')

  total_slab_items = sum(slabs$slab_len)

  Log('%s slabs, total items = %s', commas(num_slabs),
      commas(total_slab_items))
  Log('%.2f%% of allocs are slabs', num_slabs * 100 / num_allocs)
  Log('')

  #
  # reserve() calls
  #

  # There should be strictly more List::reserve() calls than NewSlab

  Log('::reserve(int n)')
  Log('')

  num_reserve = nrow(reserve)
  reserve_args = reserve %>% group_by(num_items) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_reserve)

  Log(' Num Items')
  print(reserve_args %>% head(15))
  print(reserve_args %>% tail(5))
  Log('')

  Log('%s reserve() calls, total items = %s', commas(num_reserve),
      commas(sum(reserve$num_items)))
  Log('')

  # Accounting for all allocations!
  Log('Untyped: %s', commas(num_allocs))
  Log('Typed + Str + Slab: %s', commas(num_typed + num_strings + num_slabs))
  Log('')

  num_other_typed = num_typed - num_lists

  # Summary table: one row per task, every value pre-formatted as a string
  stats = tibble(task = task_name,
                 total_bytes_ = commas(total_bytes),
                 num_allocs_ = commas(num_allocs),
                 sum_typed_strs_slabs = commas(num_typed + num_strings + num_slabs),
                 num_reserve_calls = commas(num_reserve),

                 percent_list_allocs = Percent(num_lists, num_allocs),
                 percent_slab_allocs = Percent(num_slabs, num_allocs),
                 percent_string_allocs = Percent(num_strings, num_allocs),
                 percent_other_typed_allocs = Percent(num_other_typed, num_allocs),

                 percent_list_bytes = Percent(total_list_bytes, total_bytes),
                 percent_string_bytes = Percent(total_string_bytes, total_bytes),

                 allocs_24_bytes_or_less = sprintf('%.1f%%', allocs_24_bytes_or_less),
                 allocs_48_bytes_or_less = sprintf('%.1f%%', allocs_48_bytes_or_less),
                 allocs_96_bytes_or_less = sprintf('%.1f%%', allocs_96_bytes_or_less),

                 strs_6_bytes_or_less = sprintf('%.1f%%', strs_6_bytes_or_less),
                 strs_14_bytes_or_less = sprintf('%.1f%%', strs_14_bytes_or_less)
                 )
  summaries$stats[[task_name]] = stats

  summaries$most_common_types[[task_name]] = most_common_types
}
|
1193 |
|
# Load the uftrace TSV tables for every task under in_dir.
#
# Each entry of list.files(in_dir) is treated as a task directory.  For every
# task a new environment is stored in env[[task]] and filled with the data
# frames untyped, typed, strings, slabs and reserve; a summary() of each table
# is printed after loading.
#
# Args:
#   in_dir: directory containing one subdirectory per task.
#   env: environment to populate (modified in place).
LoadUftraceTsv = function(in_dir, env) {
  # field stored in the task environment, TSV file it comes from (TSV, not
  # CSV), and the heading logged above its summary
  tables = list(
    list(field = 'untyped', file = 'all-untyped.tsv', heading = 'UNTYPED'),
    list(field = 'typed', file = 'typed.tsv', heading = 'TYPED'),
    list(field = 'strings', file = 'strings.tsv', heading = 'STRINGS'),
    list(field = 'slabs', file = 'slabs.tsv', heading = 'SLABS'),
    list(field = 'reserve', file = 'reserve.tsv', heading = 'RESERVE')
  )

  for (task in list.files(in_dir)) {
    Log('Loading data for task %s', task)
    task_dir = file.path(in_dir, task)

    loaded = new.env()
    env[[task]] = loaded

    # Read everything first, then print the summaries
    for (spec in tables) {
      loaded[[spec$field]] = readTsv(file.path(task_dir, spec$file))
    }

    # median string length is 4, mean is 9.5!
    for (spec in tables) {
      Log(spec$heading)
      print(summary(loaded[[spec$field]]))
      Log('')
    }
  }
}
|
1231 |
|
# Format n as a percentage of total with one decimal place, e.g. '25.0%'.
# Vectorized over n and total, like sprintf().
Percent = function(n, total) {
  pct = n * 100.0 / total
  sprintf('%.1f%%', pct)
}
|
1235 |
|
# Print a data frame "sideways": one output line per column of d, with the
# column name right-aligned in 26 characters followed by that column's values,
# each right-aligned in 20 characters.  A blank line is printed after certain
# columns to group related statistics.
#
# Args:
#   d: data frame (or anything t() accepts); typically one row per task.
#
# Prints nothing when d has no columns.
PrettyPrintLong = function(d) {
  tr = t(d)  # transpose: rows of tr are the columns of d

  row_names = rownames(tr)

  # Rows that end a group and are followed by an extra blank line
  group_ends = c('num_reserve_calls',
                 'percent_string_bytes',
                 'percent_other_typed_allocs',
                 'allocs_96_bytes_or_less')

  # seq_len() rather than 1:nrow(tr): the latter yields c(1, 0) for an empty
  # table and then indexes out of bounds.
  for (i in seq_len(nrow(tr))) {
    row_name = row_names[i]
    cat(sprintf('%26s', row_name))  # calculated min width manually
    cat(sprintf('%20s', tr[i,]))
    cat('\n')

    # Extra spacing
    if (row_name %in% group_ends) {
      cat('\n')
    }
  }
}
|
1256 |
|
1257 |
|
# Write the uftrace allocation reports.
#
# For every task in env, the output of UftraceTaskReport() is redirected into
# <out_dir>/<task_name>.txt.  The per-task summaries are then combined and
# written, together with the 5 most common types of each task, to
# <out_dir>/summary.txt.
#
# Args:
#   env: environment mapping task name -> task data.
#   out_dir: directory that receives the .txt reports.
#
# Returns:
#   list(stats = <data frame with one row per task>), for use in the REPL.
UftraceReport = function(env, out_dir) {
  # summaries$stats should be a list of 1-row data frames
  # summaries$most_common_types should be a list of data frames of types
  summaries = new.env()

  for (task_name in names(env)) {
    report_out = file.path(out_dir, paste0(task_name, '.txt'))

    Log('Making report for task %s -> %s', task_name, report_out)

    sink(file = report_out)
    # Always undo the redirect, even if the task report fails; otherwise
    # stdout stays diverted into the report file (e.g. in a REPL session).
    tryCatch(
      UftraceTaskReport(env, task_name, summaries),
      finally = sink())  # reset
  }
  Log('')

  # Concatenate all the data frames added to summaries
  stats = bind_rows(as.list(summaries$stats))

  sink(file = file.path(out_dir, 'summary.txt'))
  tryCatch({
    #print(stats)
    #Log('')

    PrettyPrintLong(stats)
    Log('')

    mct = summaries$most_common_types
    for (task_name in names(mct)) {
      Log('Common types in workload %s', task_name)
      Log('')

      print(mct[[task_name]] %>% head(5))
      Log('')
    }
  }, finally = sink())  # reset

  # For the REPL
  return(list(stats = stats))
}
|
1297 |
|
# Command-line entry point: dispatch to the report selected by the first
# argument.
#
# Args:
#   argv: character vector of (action, in_dir, out_dir).
#
# An unknown action is logged and the process exits with status 1.
main = function(argv) {
  action = argv[[1]]
  in_dir = argv[[2]]
  out_dir = argv[[3]]

  switch(action,
    'osh-parser' = ParserReport(in_dir, out_dir),
    'osh-runtime' = RuntimeReport(in_dir, out_dir),
    'vm-baseline' = VmBaselineReport(in_dir, out_dir),
    'ovm-build' = OvmBuildReport(in_dir, out_dir),
    'compute' = ComputeReport(in_dir, out_dir),
    'gc' = GcReport(in_dir, out_dir),
    'gc-cachegrind' = GcCachegrindReport(in_dir, out_dir),
    'mycpp' = MyCppReport(in_dir, out_dir),
    'uftrace' = {
      # Load all tasks into a fresh environment, then report on it
      d = new.env()
      LoadUftraceTsv(in_dir, d)
      UftraceReport(d, out_dir)
    },
    {
      # No name matched
      Log("Invalid action '%s'", action)
      quit(status = 1)
    }
  )

  Log('PID %d done', Sys.getpid())
}
|
1338 |
|
# Call main() only when there are no active function frames, i.e. when this
# code is evaluated at top level rather than from inside another call.
if (length(sys.frames()) == 0) {
  # increase ggplot font size globally
  #theme_set(theme_grey(base_size = 20))

  main(commandArgs(trailingOnly = TRUE))
}
|