1 | #!/usr/bin/env Rscript
|
2 | #
|
3 | # benchmarks/report.R -- Analyze data collected by shell scripts.
|
4 | #
|
5 | # Usage:
|
6 | # benchmarks/report.R OUT_DIR [TIMES_CSV...]
|
7 |
|
8 | # Suppress warnings about functions masked from 'package:stats' and 'package:base'
|
9 | # filter, lag
|
10 | # intersect, setdiff, setequal, union
|
11 | library(dplyr, warn.conflicts = FALSE)
|
12 | library(tidyr) # spread()
|
13 | library(stringr)
|
14 |
|
15 | source('benchmarks/common.R')
|
16 |
|
17 | options(stringsAsFactors = F)
|
18 |
|
19 | # For pretty printing
|
20 | commas = function(x) {
|
21 | format(x, big.mark=',')
|
22 | }
|
23 |
|
24 | sourceUrl = function(path) {
|
25 | sprintf('https://github.com/oilshell/oil/blob/master/%s', path)
|
26 | }
|
27 |
|
28 | # Takes a filename, not a path.
|
29 | sourceUrl2 = function(filename) {
|
30 | sprintf(
|
31 | 'https://github.com/oilshell/oil/blob/master/benchmarks/testdata/%s',
|
32 | filename)
|
33 | }
|
34 |
|
35 | mycppUrl = function(name) {
|
36 | sprintf('https://github.com/oilshell/oil/blob/master/mycpp/examples/%s.py', name)
|
37 | }
|
38 |
|
39 | genUrl = function(name) {
|
40 | sprintf('../../_gen/mycpp/examples/%s.mycpp.cc', name)
|
41 | }
|
42 |
|
43 |
|
44 | # TODO: Set up cgit because GitHub links are slow.
|
45 | benchmarkDataLink = function(subdir, name, suffix) {
|
46 | #sprintf('../../../../benchmark-data/shell-id/%s', shell_id)
|
47 | sprintf('https://github.com/oilshell/benchmark-data/blob/master/%s/%s%s',
|
48 | subdir, name, suffix)
|
49 | }
|
50 |
|
51 | provenanceLink = function(subdir, name, suffix) {
|
52 | sprintf('../%s/%s%s', subdir, name, suffix)
|
53 | }
|
54 |
|
55 |
|
56 | GetOshLabel = function(shell_hash, prov_dir) {
|
57 | ### Given a shell hash (string), return a readable label (string).
|
58 |
|
59 | path = sprintf('%s/shell-id/osh-%s/sh-path.txt', prov_dir, shell_hash)
|
60 |
|
61 | if (file.exists(path)) {
|
62 | Log('Reading %s', path)
|
63 | lines = readLines(path)
|
64 | if (length(grep('_bin/osh', lines)) > 0) {
|
65 | label = 'osh-ovm'
|
66 | } else if (length(grep('bin/osh', lines)) > 0) {
|
67 | label = 'osh-cpython'
|
68 | } else if (length(grep('_bin/.*/osh', lines)) > 0) {
|
69 | label = 'osh-native'
|
70 | } else {
|
71 | stop("Expected _bin/osh, bin/osh, or _bin/.*/osh")
|
72 | }
|
73 | } else {
|
74 | stop(sprintf("%s doesn't exist", path))
|
75 | }
|
76 | return(label)
|
77 | }
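
# Illustrative sketch of GetOshLabel() (hypothetical hash, not executed): it
# reads <prov_dir>/shell-id/osh-<hash>/sh-path.txt and greps the recorded path.
if (FALSE) {
  # A sh-path.txt containing '_bin/cxx-opt/osh' matches '_bin/.*/osh', so the
  # label is 'osh-native'; a plain 'bin/osh' would yield 'osh-cpython'.
  GetOshLabel('0a1b2c3', '_tmp')  # hypothetical shell hash
}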
|
78 |
|
79 | opt_suffix1 = '_bin/cxx-opt/osh'
|
80 | opt_suffix2 = '_bin/cxx-opt-sh/osh'
|
81 |
|
82 | ShellLabels = function(shell_name, shell_hash, num_hosts) {
|
83 | ### Given parallel vectors of shell names and hashes, return a vector of readable labels.
|
84 |
|
85 | # TODO: Clean up callers. Some metrics call this function with a
|
86 | # shell/runtime BASENAME, and others with a PATH
|
87 | # - e.g. ComputeReport calls this with runtime_name which is actually a PATH
|
88 |
|
89 | #Log('name %s', shell_name)
|
90 | #Log('hash %s', shell_hash)
|
91 |
|
92 | if (num_hosts == 1) {
|
93 | prov_dir = '_tmp'
|
94 | } else {
|
95 | prov_dir = '../benchmark-data/'
|
96 | }
|
97 |
|
98 | labels = c()
|
99 | for (i in 1:length(shell_name)) {
|
100 | sh = shell_name[i]
|
101 | if (sh == 'osh') {
|
102 | label = GetOshLabel(shell_hash[i], prov_dir)
|
103 |
|
104 | } else if (endsWith(sh, opt_suffix1) || endsWith(sh, opt_suffix2)) {
|
105 | label = 'opt/osh'
|
106 |
|
107 | } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
|
108 | label = 'bumpleak/osh'
|
109 |
|
110 | } else {
|
111 | label = sh
|
112 | }
|
113 |
|
114 | Log('[%s] [%s]', shell_name[i], label)
|
115 | labels = c(labels, label)
|
116 | }
|
117 |
|
118 | return(labels)
|
119 | }
|
120 |
|
121 | # Simple version of the above, used by benchmarks/gc
|
122 | ShellLabelFromPath = function(sh_path) {
|
123 | labels = c()
|
124 | for (i in 1:length(sh_path)) {
|
125 | sh = sh_path[i]
|
126 |
|
127 | if (endsWith(sh, opt_suffix1) || endsWith(sh, opt_suffix2)) {
|
128 | # the opt binary is osh-native
|
129 | label = 'osh-native'
|
130 |
|
131 | } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
|
132 | label = 'bumpleak/osh'
|
133 |
|
134 | } else if (endsWith(sh, '_bin/osh')) { # the app bundle
|
135 | label = 'osh-ovm'
|
136 |
|
137 | } else if (endsWith(sh, 'bin/osh')) {
|
138 | label = 'osh-cpython'
|
139 |
|
140 | } else {
|
141 | label = sh
|
142 | }
|
143 | labels = c(labels, label)
|
144 | }
|
145 | return(labels)
|
146 | }
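
# Small sketch of the suffix-to-label mapping above (example paths, not
# executed):
if (FALSE) {
  ShellLabelFromPath(c('_bin/cxx-opt/osh',  # -> 'osh-native'
                       '_bin/osh',          # -> 'osh-ovm' (app bundle)
                       'bin/osh',           # -> 'osh-cpython'
                       '/bin/bash'))        # -> '/bin/bash' (unchanged)
}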
|
147 |
|
148 | DistinctHosts = function(t) {
|
149 | t %>% distinct(host_name, host_hash) -> distinct_hosts
|
150 | # The label is just the name
|
151 | distinct_hosts$host_label = distinct_hosts$host_name
|
152 | return(distinct_hosts)
|
153 | }
|
154 |
|
155 | DistinctShells = function(t, num_hosts = -1) {
|
156 | t %>% distinct(shell_name, shell_hash) -> distinct_shells
|
157 |
|
158 | Log('')
|
159 | Log('Labeling shells')
|
160 |
|
161 | # Calculate it if not passed
|
162 | if (num_hosts == -1) {
|
163 | num_hosts = nrow(DistinctHosts(t))
|
164 | }
|
165 |
|
166 | distinct_shells$shell_label = ShellLabels(distinct_shells$shell_name,
|
167 | distinct_shells$shell_hash,
|
168 | num_hosts)
|
169 | return(distinct_shells)
|
170 | }
|
171 |
|
172 | ParserReport = function(in_dir, out_dir) {
|
173 | times = read.csv(file.path(in_dir, 'times.csv'))
|
174 | lines = read.csv(file.path(in_dir, 'lines.csv'))
|
175 | raw_data = read.csv(file.path(in_dir, 'raw-data.csv'))
|
176 |
|
177 | cachegrind = readTsv(file.path(in_dir, 'cachegrind.tsv'))
|
178 |
|
179 | # For joining by filename
|
180 | lines_by_filename = tibble(
|
181 | num_lines = lines$num_lines,
|
182 | filename = basename(lines$path)
|
183 | )
|
184 |
|
185 | # Remove failures
|
186 | times %>% filter(status == 0) %>% select(-c(status)) -> times
|
187 | cachegrind %>% filter(status == 0) %>% select(-c(status)) -> cachegrind
|
188 |
|
189 | # Add the number of lines, joining on path, and compute lines/ms
|
190 | times %>%
|
191 | left_join(lines, by = c('path')) %>%
|
192 | mutate(filename = basename(path), filename_HREF = sourceUrl(path),
|
193 | max_rss_MB = max_rss_KiB * 1024 / 1e6,
|
194 | elapsed_ms = elapsed_secs * 1000,
|
195 | user_ms = user_secs * 1000,
|
196 | sys_ms = sys_secs * 1000,
|
197 | lines_per_ms = num_lines / elapsed_ms) %>%
|
198 | select(-c(path, max_rss_KiB, elapsed_secs, user_secs, sys_secs)) ->
|
199 | joined_times
|
200 |
|
201 | #print(head(times))
|
202 | #print(head(lines))
|
203 | #print(head(vm))
|
204 | #print(head(joined_times))
|
205 |
|
206 | print(summary(joined_times))
|
207 |
|
208 | #
|
209 | # Find distinct shells and hosts, and label them for readability.
|
210 | #
|
211 |
|
212 | distinct_hosts = DistinctHosts(joined_times)
|
213 | Log('')
|
214 | Log('Distinct hosts')
|
215 | print(distinct_hosts)
|
216 |
|
217 | distinct_shells = DistinctShells(joined_times)
|
218 | Log('')
|
219 | Log('Distinct shells')
|
220 | print(distinct_shells)
|
221 |
|
222 | # Replace name/hash combinations with labels.
|
223 | joined_times %>%
|
224 | left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
|
225 | left_join(distinct_shells, by = c('shell_name', 'shell_hash')) %>%
|
226 | select(-c(host_name, host_hash, shell_name, shell_hash)) ->
|
227 | joined_times
|
228 |
|
229 | # Like 'times', but do shell_label as one step
|
230 | # Hack: we know benchmarks/auto.sh runs this on one machine
|
231 | distinct_shells_2 = DistinctShells(cachegrind, num_hosts = nrow(distinct_hosts))
|
232 | cachegrind %>%
|
233 | left_join(lines, by = c('path')) %>%
|
234 | select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
|
235 | left_join(distinct_shells_2, by = c('shell_name', 'shell_hash')) %>%
|
236 | select(-c(shell_name, shell_hash)) %>%
|
237 | mutate(filename = basename(path), filename_HREF = sourceUrl(path)) %>%
|
238 | select(-c(path)) ->
|
239 | joined_cachegrind
|
240 |
|
241 | Log('summary(joined_times):')
|
242 | print(summary(joined_times))
|
243 | Log('head(joined_times):')
|
244 | print(head(joined_times))
|
245 |
|
246 | # Summarize rates by platform/shell
|
247 | joined_times %>%
|
248 | mutate(host_label = paste("host", host_label)) %>%
|
249 | group_by(host_label, shell_label) %>%
|
250 | summarize(total_lines = sum(num_lines), total_ms = sum(elapsed_ms)) %>%
|
251 | mutate(lines_per_ms = total_lines / total_ms) %>%
|
252 | select(-c(total_ms)) %>%
|
253 | spread(key = host_label, value = lines_per_ms) ->
|
254 | times_summary
|
255 |
|
256 | # Sort by parsing rate on the fast machine
|
257 | if ("host lenny" %in% colnames(times_summary)) {
|
258 | times_summary %>% arrange(desc(`host lenny`)) -> times_summary
|
259 | } else {
|
260 | times_summary %>% arrange(desc(`host no-host`)) -> times_summary
|
261 | }
|
262 |
|
263 | Log('times_summary:')
|
264 | print(times_summary)
|
265 |
|
266 | # Summarize cachegrind by platform/shell
|
267 | # Bug fix: as.numeric(irefs) avoids 32-bit integer overflow!
|
268 | joined_cachegrind %>%
|
269 | group_by(shell_label) %>%
|
270 | summarize(total_lines = sum(num_lines), total_irefs = sum(as.numeric(irefs))) %>%
|
271 | mutate(thousand_irefs_per_line = total_irefs / total_lines / 1000) %>%
|
272 | select(-c(total_irefs)) ->
|
273 | cachegrind_summary
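
# Why as.numeric() above: R integers are 32-bit, so summing raw cachegrind
# instruction counts can overflow to NA.  Minimal illustration (not executed):
if (FALSE) {
  sum(c(2100000000L, 2100000000L))              # NA, with an overflow warning
  sum(as.numeric(c(2100000000L, 2100000000L)))  # 4.2e+09
}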
|
274 |
|
275 | if ("no-host" %in% distinct_hosts$host_label) {
|
276 |
|
277 | # We don't have all the shells
|
278 | elapsed = NULL
|
279 | rate = NULL
|
280 | max_rss = NULL
|
281 | instructions = NULL
|
282 |
|
283 | joined_times %>%
|
284 | select(c(shell_label, elapsed_ms, user_ms, sys_ms, max_rss_MB,
|
285 | num_lines, filename, filename_HREF)) %>%
|
286 | arrange(filename, elapsed_ms) ->
|
287 | times_flat
|
288 |
|
289 | joined_cachegrind %>%
|
290 | select(c(shell_label, irefs, num_lines, filename, filename_HREF)) %>%
|
291 | arrange(filename, irefs) ->
|
292 | cachegrind_flat
|
293 |
|
294 | } else {
|
295 |
|
296 | times_flat = NULL
|
297 | cachegrind_flat = NULL
|
298 |
|
299 | # Elapsed milliseconds for each shell by platform and file
|
300 | joined_times %>%
|
301 | select(-c(lines_per_ms, user_ms, sys_ms, max_rss_MB)) %>%
|
302 | spread(key = shell_label, value = elapsed_ms) %>%
|
303 | arrange(host_label, num_lines) %>%
|
304 | mutate(osh_to_bash_ratio = `osh-native` / bash) %>%
|
305 | select(c(host_label, bash, dash, mksh, zsh,
|
306 | `osh-ovm`, `osh-cpython`, `osh-native`,
|
307 | osh_to_bash_ratio, num_lines, filename, filename_HREF)) ->
|
308 | elapsed
|
309 |
|
310 | Log('\n')
|
311 | Log('ELAPSED')
|
312 | print(elapsed)
|
313 |
|
314 | # Rates by file and shell
|
315 | joined_times %>%
|
316 | select(-c(elapsed_ms, user_ms, sys_ms, max_rss_MB)) %>%
|
317 | spread(key = shell_label, value = lines_per_ms) %>%
|
318 | arrange(host_label, num_lines) %>%
|
319 | select(c(host_label, bash, dash, mksh, zsh,
|
320 | `osh-ovm`, `osh-cpython`, `osh-native`,
|
321 | num_lines, filename, filename_HREF)) ->
|
322 | rate
|
323 |
|
324 | Log('\n')
|
325 | Log('RATE')
|
326 | print(rate)
|
327 |
|
328 | # Memory usage by file
|
329 | joined_times %>%
|
330 | select(-c(elapsed_ms, lines_per_ms, user_ms, sys_ms)) %>%
|
331 | spread(key = shell_label, value = max_rss_MB) %>%
|
332 | arrange(host_label, num_lines) %>%
|
333 | select(c(host_label, bash, dash, mksh, zsh,
|
334 | `osh-ovm`, `osh-cpython`, `osh-native`,
|
335 | num_lines, filename, filename_HREF)) ->
|
336 | max_rss
|
337 |
|
338 | Log('\n')
|
339 | Log('MAX RSS')
|
340 | print(max_rss)
|
341 |
|
342 | Log('\n')
|
343 | Log('joined_cachegrind has %d rows', nrow(joined_cachegrind))
|
344 | print(joined_cachegrind)
|
345 | #print(joined_cachegrind %>% filter(path == 'benchmarks/testdata/configure-helper.sh'))
|
346 |
|
347 | # Cachegrind instructions by file
|
348 | joined_cachegrind %>%
|
349 | mutate(thousand_irefs_per_line = irefs / num_lines / 1000) %>%
|
350 | select(-c(irefs)) %>%
|
351 | spread(key = shell_label, value = thousand_irefs_per_line) %>%
|
352 | arrange(num_lines) %>%
|
353 | select(c(bash, dash, mksh, `osh-native`,
|
354 | num_lines, filename, filename_HREF)) ->
|
355 | instructions
|
356 |
|
357 | Log('\n')
|
358 | Log('instructions has %d rows', nrow(instructions))
|
359 | print(instructions)
|
360 | }
|
361 |
|
362 | WriteProvenance(distinct_hosts, distinct_shells, out_dir)
|
363 |
|
364 | raw_data_table = tibble(
|
365 | filename = basename(as.character(raw_data$path)),
|
366 | filename_HREF = benchmarkDataLink('osh-parser', filename, '')
|
367 | )
|
368 | #print(raw_data_table)
|
369 |
|
370 | writeCsv(raw_data_table, file.path(out_dir, 'raw-data'))
|
371 |
|
372 | precision = SamePrecision(0) # lines per ms
|
373 | writeCsv(times_summary, file.path(out_dir, 'summary'), precision)
|
374 |
|
375 | precision = ColumnPrecision(list(), default = 1)
|
376 | writeTsv(cachegrind_summary, file.path(out_dir, 'cachegrind_summary'), precision)
|
377 |
|
378 | if (!is.null(times_flat)) {
|
379 | precision = SamePrecision(0)
|
380 | writeTsv(times_flat, file.path(out_dir, 'times_flat'), precision)
|
381 | }
|
382 |
|
383 | if (!is.null(cachegrind_flat)) {
|
384 | precision = SamePrecision(0)
|
385 | writeTsv(cachegrind_flat, file.path(out_dir, 'cachegrind_flat'), precision)
|
386 | }
|
387 |
|
388 | if (!is.null(elapsed)) { # i.e. not the 'no-host' case
|
389 | # Round to nearest millisecond, but the ratio has a decimal point.
|
390 | precision = ColumnPrecision(list(osh_to_bash_ratio = 1), default = 0)
|
391 | writeCsv(elapsed, file.path(out_dir, 'elapsed'), precision)
|
392 |
|
393 | precision = SamePrecision(0)
|
394 | writeCsv(rate, file.path(out_dir, 'rate'), precision)
|
395 |
|
396 | writeCsv(max_rss, file.path(out_dir, 'max_rss'))
|
397 |
|
398 | precision = SamePrecision(1)
|
399 | writeTsv(instructions, file.path(out_dir, 'instructions'), precision)
|
400 | }
|
401 |
|
402 | Log('Wrote %s', out_dir)
|
403 | }
|
404 |
|
405 | WriteProvenance = function(distinct_hosts, distinct_shells, out_dir, tsv = F) {
|
406 |
|
407 | num_hosts = nrow(distinct_hosts)
|
408 | if (num_hosts == 1) {
|
409 | linkify = provenanceLink
|
410 | } else {
|
411 | linkify = benchmarkDataLink
|
412 | }
|
413 |
|
414 | Log('distinct_hosts')
|
415 | print(distinct_hosts)
|
416 | Log('')
|
417 |
|
418 | Log('distinct_shells')
|
419 | print(distinct_shells)
|
420 | Log('')
|
421 |
|
422 | # Should be:
|
423 | # host_id_url
|
424 | # And then csv_to_html will be smart enough? It should take a --url flag?
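
# Note: the FOO / FOO_HREF column pairing (as with filename / filename_HREF
# above) appears to be the convention the HTML table generator uses to turn
# FOO into a link; host_id_HREF and shell_id_HREF below follow it.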
|
425 | host_table = tibble(
|
426 | host_label = distinct_hosts$host_label,
|
427 | host_id = paste(distinct_hosts$host_name,
|
428 | distinct_hosts$host_hash, sep='-'),
|
429 | host_id_HREF = linkify('host-id', host_id, '/')
|
430 | )
|
431 | Log('host_table')
|
432 | print(host_table)
|
433 | Log('')
|
434 |
|
435 | shell_table = tibble(
|
436 | shell_label = distinct_shells$shell_label,
|
437 | shell_id = paste(distinct_shells$shell_name,
|
438 | distinct_shells$shell_hash, sep='-'),
|
439 | shell_id_HREF = linkify('shell-id', shell_id, '/')
|
440 | )
|
441 |
|
442 | Log('shell_table')
|
443 | print(shell_table)
|
444 | Log('')
|
445 |
|
446 | if (tsv) {
|
447 | writeTsv(host_table, file.path(out_dir, 'hosts'))
|
448 | writeTsv(shell_table, file.path(out_dir, 'shells'))
|
449 | } else {
|
450 | writeCsv(host_table, file.path(out_dir, 'hosts'))
|
451 | writeCsv(shell_table, file.path(out_dir, 'shells'))
|
452 | }
|
453 | }
|
454 |
|
455 | WriteSimpleProvenance = function(provenance, out_dir) {
|
456 | Log('provenance')
|
457 | print(provenance)
|
458 | Log('')
|
459 |
|
460 | # Legacy: add $shell_name, because "$shell_basename-$shell_hash" is what
|
461 | # benchmarks/id.sh publish-shell-id uses
|
462 | provenance %>%
|
463 | mutate(shell_name = basename(sh_path)) %>%
|
464 | distinct(shell_label, shell_name, shell_hash) ->
|
465 | distinct_shells
|
466 |
|
467 | Log('distinct_shells')
|
468 | print(distinct_shells)
|
469 | Log('')
|
470 |
|
471 | provenance %>% distinct(host_label, host_name, host_hash) -> distinct_hosts
|
472 |
|
473 | WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = T)
|
474 | }
|
475 |
|
476 | RuntimeReport = function(in_dir, out_dir) {
|
477 | times = readTsv(file.path(in_dir, 'times.tsv'))
|
478 |
|
479 | gc_stats = readTsv(file.path(in_dir, 'gc_stats.tsv'))
|
480 | provenance = readTsv(file.path(in_dir, 'provenance.tsv'))
|
481 |
|
482 | times %>% filter(status != 0) -> failed
|
483 | if (nrow(failed) != 0) {
|
484 | print(failed)
|
485 | stop('Some osh-runtime tasks failed')
|
486 | }
|
487 |
|
488 | # Joins:
|
489 | # times <= sh_path => provenance
|
490 | # times <= join_id, host_name => gc_stats
|
491 |
|
492 | # TODO: provenance may have rows from 2 machines. Could validate them and
|
493 | # deduplicate.
|
494 |
|
495 | # It should have (host_label, host_name, host_hash)
|
496 | # (shell_label, sh_path, shell_hash)
|
497 | provenance %>%
|
498 | mutate(host_label = host_name, shell_label = ShellLabelFromPath(sh_path)) ->
|
499 | provenance
|
500 |
|
501 | provenance %>% distinct(sh_path, shell_label) -> label_lookup
|
502 |
|
503 | Log('label_lookup')
|
504 | print(label_lookup)
|
505 |
|
506 | # Join with provenance for host label and shell label
|
507 | times %>%
|
508 | select(c(elapsed_secs, user_secs, sys_secs, max_rss_KiB, task_id,
|
509 | host_name, sh_path, workload)) %>%
|
510 | mutate(elapsed_ms = elapsed_secs * 1000,
|
511 | user_ms = user_secs * 1000,
|
512 | sys_ms = sys_secs * 1000,
|
513 | max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
|
514 | select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
|
515 | left_join(label_lookup, by = c('sh_path')) %>%
|
516 | select(-c(sh_path)) %>%
|
517 | # we want to compare workloads on adjacent rows
|
518 | arrange(workload) ->
|
519 | details
|
520 |
|
521 | times %>%
|
522 | select(c(task_id, host_name, sh_path, workload, minor_faults, major_faults, swaps, in_block, out_block, signals, voluntary_ctx, involuntary_ctx)) %>%
|
523 | left_join(label_lookup, by = c('sh_path')) %>%
|
524 | select(-c(sh_path)) %>%
|
525 | # we want to compare workloads on adjacent rows
|
526 | arrange(workload) ->
|
527 | details_io
|
528 |
|
529 | Log('details')
|
530 | print(details)
|
531 |
|
532 | # Elapsed time comparison
|
533 | details %>%
|
534 | select(-c(task_id, user_ms, sys_ms, max_rss_MB)) %>%
|
535 | spread(key = shell_label, value = elapsed_ms) %>%
|
536 | mutate(py_bash_ratio = `osh-cpython` / bash) %>%
|
537 | mutate(native_bash_ratio = `osh-native` / bash) %>%
|
538 | arrange(workload, host_name) %>%
|
539 | select(c(workload, host_name,
|
540 | bash, dash, `osh-cpython`, `osh-native`,
|
541 | py_bash_ratio, native_bash_ratio)) ->
|
542 | elapsed
|
543 |
|
544 |
|
545 | Log('elapsed')
|
546 | print(elapsed)
|
547 |
|
548 | # Minor Page Faults Comparison
|
549 | details_io %>%
|
550 | select(c(host_name, shell_label, workload, minor_faults)) %>%
|
551 | spread(key = shell_label, value = minor_faults) %>%
|
552 | mutate(py_bash_ratio = `osh-cpython` / bash) %>%
|
553 | mutate(native_bash_ratio = `osh-native` / bash) %>%
|
554 | arrange(workload, host_name) %>%
|
555 | select(c(workload, host_name,
|
556 | bash, dash, `osh-cpython`, `osh-native`,
|
557 | py_bash_ratio, native_bash_ratio)) ->
|
558 | page_faults
|
559 |
|
560 | Log('page_faults')
|
561 | print(page_faults)
|
562 |
|
563 | # Max RSS comparison
|
564 | details %>%
|
565 | select(c(host_name, shell_label, workload, max_rss_MB)) %>%
|
566 | spread(key = shell_label, value = max_rss_MB) %>%
|
567 | mutate(py_bash_ratio = `osh-cpython` / bash) %>%
|
568 | mutate(native_bash_ratio = `osh-native` / bash) %>%
|
569 | arrange(workload, host_name) %>%
|
570 | select(c(workload, host_name,
|
571 | bash, dash, `osh-cpython`, `osh-native`,
|
572 | py_bash_ratio, native_bash_ratio)) ->
|
573 | max_rss
|
574 |
|
575 | Log('max rss')
|
576 | print(max_rss)
|
577 |
|
578 | details %>%
|
579 | select(c(task_id, host_name, workload, elapsed_ms, max_rss_MB)) %>%
|
580 | mutate(join_id = sprintf("gc-%d", task_id)) %>%
|
581 | select(-c(task_id)) ->
|
582 | gc_details
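
# join_id has the form "gc-<task_id>", which appears to be how rows in
# gc_stats.tsv point back to the timing row they were collected under.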
|
583 |
|
584 | Log('GC details')
|
585 | print(gc_details)
|
586 | Log('')
|
587 |
|
588 | Log('GC stats')
|
589 | print(gc_stats)
|
590 | Log('')
|
591 |
|
592 | gc_stats %>%
|
593 | left_join(gc_details, by = c('join_id', 'host_name')) %>%
|
594 | select(-c(join_id, roots_capacity, objs_capacity)) %>%
|
595 | # Do the same transformations as GcReport()
|
596 | mutate(allocated_MB = bytes_allocated / 1e6) %>%
|
597 | select(-c(bytes_allocated)) %>%
|
598 | rename(num_gc_done = num_collections) %>%
|
599 | # Put these columns first
|
600 | relocate(workload, host_name,
|
601 | elapsed_ms, max_gc_millis, total_gc_millis,
|
602 | allocated_MB, max_rss_MB, num_allocated) ->
|
603 | gc_stats
|
604 |
|
605 | Log('After GC stats')
|
606 | print(gc_stats)
|
607 | Log('')
|
608 |
|
609 | WriteSimpleProvenance(provenance, out_dir)
|
610 |
|
611 | # milliseconds don't need a decimal digit
|
612 | precision = ColumnPrecision(list(bash = 0, dash = 0, `osh-cpython` = 0,
|
613 | `osh-native` = 0, py_bash_ratio = 2,
|
614 | native_bash_ratio = 2))
|
615 | writeTsv(elapsed, file.path(out_dir, 'elapsed'), precision)
|
616 | writeTsv(page_faults, file.path(out_dir, 'page_faults'), precision)
|
617 |
|
618 | precision2 = ColumnPrecision(list(py_bash_ratio = 2, native_bash_ratio = 2))
|
619 | writeTsv(max_rss, file.path(out_dir, 'max_rss'), precision2)
|
620 |
|
621 | precision3 = ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
|
622 | default = 0)
|
623 | writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision3)
|
624 |
|
625 | writeTsv(details, file.path(out_dir, 'details'), precision3)
|
626 | writeTsv(details_io, file.path(out_dir, 'details_io'))
|
627 |
|
628 | Log('Wrote %s', out_dir)
|
629 | }
|
630 |
|
631 | VmBaselineReport = function(in_dir, out_dir) {
|
632 | vm = readTsv(file.path(in_dir, 'vm-baseline.tsv'))
|
633 | #print(vm)
|
634 |
|
635 | # Not using DistinctHosts() because the host_hash field isn't collected
|
636 | num_hosts = nrow(vm %>% distinct(host))
|
637 |
|
638 | vm %>%
|
639 | rename(kib = metric_value) %>%
|
640 | mutate(shell_label = ShellLabels(shell_name, shell_hash, num_hosts),
|
641 | megabytes = kib * 1024 / 1e6) %>%
|
642 | select(-c(shell_name, kib)) %>%
|
643 | spread(key = c(metric_name), value = megabytes) %>%
|
644 | rename(VmPeak_MB = VmPeak, VmRSS_MB = VmRSS) %>%
|
645 | select(c(shell_label, shell_hash, host, VmRSS_MB, VmPeak_MB)) %>%
|
646 | arrange(shell_label, shell_hash, host, VmPeak_MB) ->
|
647 | vm
|
648 |
|
649 | print(vm)
|
650 |
|
651 | writeTsv(vm, file.path(out_dir, 'vm-baseline'))
|
652 | }
|
653 |
|
654 | WriteOvmBuildDetails = function(distinct_hosts, distinct_compilers, out_dir) {
|
655 | host_table = tibble(
|
656 | host_label = distinct_hosts$host_label,
|
657 | host_id = paste(distinct_hosts$host_name,
|
658 | distinct_hosts$host_hash, sep='-'),
|
659 | host_id_HREF = benchmarkDataLink('host-id', host_id, '/')
|
660 | )
|
661 | print(host_table)
|
662 |
|
663 | dc = distinct_compilers
|
664 | compiler_table = tibble(
|
665 | compiler_label = dc$compiler_label,
|
666 | compiler_id = paste(dc$compiler_label, dc$compiler_hash, sep='-'),
|
667 | compiler_id_HREF = benchmarkDataLink('compiler-id', compiler_id, '/')
|
668 | )
|
669 | print(compiler_table)
|
670 |
|
671 | writeTsv(host_table, file.path(out_dir, 'hosts'))
|
672 | writeTsv(compiler_table, file.path(out_dir, 'compilers'))
|
673 | }
|
674 |
|
675 | OvmBuildReport = function(in_dir, out_dir) {
|
676 | times = readTsv(file.path(in_dir, 'times.tsv'))
|
677 | bytecode_size = readTsv(file.path(in_dir, 'bytecode-size.tsv'))
|
678 | bin_sizes = readTsv(file.path(in_dir, 'bin-sizes.tsv'))
|
679 | native_sizes = readTsv(file.path(in_dir, 'native-sizes.tsv'))
|
680 | raw_data = readTsv(file.path(in_dir, 'raw-data.tsv'))
|
681 |
|
682 | times %>% filter(status != 0) -> failed
|
683 | if (nrow(failed) != 0) {
|
684 | print(failed)
|
685 | stop('Some ovm-build tasks failed')
|
686 | }
|
687 |
|
688 | times %>% distinct(host_name, host_hash) -> distinct_hosts
|
689 | distinct_hosts$host_label = distinct_hosts$host_name
|
690 |
|
691 | times %>% distinct(compiler_path, compiler_hash) -> distinct_compilers
|
692 | distinct_compilers$compiler_label = basename(distinct_compilers$compiler_path)
|
693 |
|
694 | #print(distinct_hosts)
|
695 | #print(distinct_compilers)
|
696 |
|
697 | WriteOvmBuildDetails(distinct_hosts, distinct_compilers, out_dir)
|
698 |
|
699 | times %>%
|
700 | select(-c(status)) %>%
|
701 | left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
|
702 | left_join(distinct_compilers, by = c('compiler_path', 'compiler_hash')) %>%
|
703 | select(-c(host_name, host_hash, compiler_path, compiler_hash)) %>%
|
704 | mutate(src_dir = basename(src_dir),
|
705 | host_label = paste("host ", host_label),
|
706 | is_conf = str_detect(action, 'configure'),
|
707 | is_ovm = str_detect(action, 'oil.ovm'),
|
708 | is_dbg = str_detect(action, 'dbg'),
|
709 | ) %>%
|
710 | select(host_label, src_dir, compiler_label, action, is_conf, is_ovm, is_dbg,
|
711 | elapsed_secs) %>%
|
712 | spread(key = c(host_label), value = elapsed_secs) %>%
|
713 | arrange(src_dir, compiler_label, desc(is_conf), is_ovm, desc(is_dbg)) %>%
|
714 | select(-c(is_conf, is_ovm, is_dbg)) ->
|
715 | times
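
# The is_conf / is_ovm / is_dbg flags above exist only to control row order
# (configure rows first, dbg before opt within a group); they are dropped
# right after arrange().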
|
716 |
|
717 | #print(times)
|
718 |
|
719 | bytecode_size %>%
|
720 | rename(bytecode_size = num_bytes) %>%
|
721 | select(-c(path)) ->
|
722 | bytecode_size
|
723 |
|
724 | bin_sizes %>%
|
725 | # reorder
|
726 | select(c(host_label, path, num_bytes)) %>%
|
727 | left_join(bytecode_size, by = c('host_label')) %>%
|
728 | mutate(native_code_size = num_bytes - bytecode_size) ->
|
729 | sizes
|
730 |
|
731 | # paths look like _tmp/ovm-build/bin/clang/oils_cpp.stripped
|
732 | native_sizes %>%
|
733 | select(c(host_label, path, num_bytes)) %>%
|
734 | mutate(host_label = paste("host ", host_label),
|
735 | binary = basename(path),
|
736 | compiler = basename(dirname(path)),
|
737 | ) %>%
|
738 | select(-c(path)) %>%
|
739 | spread(key = c(host_label), value = num_bytes) %>%
|
740 | arrange(compiler, binary) ->
|
741 | native_sizes
|
742 |
|
743 | # NOTE: These don't have the host and compiler.
|
744 | writeTsv(times, file.path(out_dir, 'times'))
|
745 | writeTsv(bytecode_size, file.path(out_dir, 'bytecode-size'))
|
746 | writeTsv(sizes, file.path(out_dir, 'sizes'))
|
747 | writeTsv(native_sizes, file.path(out_dir, 'native-sizes'))
|
748 |
|
749 | # TODO: I want a size report too
|
750 | #writeCsv(sizes, file.path(out_dir, 'sizes'))
|
751 | }
|
752 |
|
753 | unique_stdout_md5sum = function(t, num_expected) {
|
754 | u = n_distinct(t$stdout_md5sum)
|
755 | if (u != num_expected) {
|
756 | t %>% select(c(host_name, task_name, arg1, arg2, runtime_name, stdout_md5sum)) %>% print()
|
757 | stop(sprintf('Expected %d unique md5sums, got %d', num_expected, u))
|
758 | }
|
759 | }
|
760 |
|
761 | ComputeReport = function(in_dir, out_dir) {
|
762 | # TSV file, not CSV
|
763 | times = read.table(file.path(in_dir, 'times.tsv'), header=T)
|
764 | print(times)
|
765 |
|
766 | times %>% filter(status != 0) -> failed
|
767 | if (nrow(failed) != 0) {
|
768 | print(failed)
|
769 | stop('Some compute tasks failed')
|
770 | }
|
771 |
|
772 | #
|
773 | # Check correctness
|
774 | #
|
775 |
|
776 | times %>% filter(task_name == 'hello') %>% unique_stdout_md5sum(1)
|
777 | times %>% filter(task_name == 'fib') %>% unique_stdout_md5sum(1)
|
778 | times %>% filter(task_name == 'word_freq') %>% unique_stdout_md5sum(1)
|
779 | # 3 different inputs
|
780 | times %>% filter(task_name == 'parse_help') %>% unique_stdout_md5sum(3)
|
781 |
|
782 | times %>% filter(task_name == 'bubble_sort') %>% unique_stdout_md5sum(2)
|
783 |
|
784 | # TODO:
|
785 | # - oils_cpp doesn't implement unicode LANG=C
|
786 | # - bash behaves differently on your desktop vs. in the container
|
787 | # - might need layer-locales in the image?
|
788 |
|
789 | #times %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>% unique_stdout_md5sum(1)
|
790 | # Ditto here
|
791 | #times %>% filter(task_name == 'palindrome' & arg1 == 'bytes') %>% unique_stdout_md5sum(1)
|
792 |
|
793 | #
|
794 | # Find distinct shells and hosts, and label them for readability.
|
795 | #
|
796 |
|
797 | # Runtimes are called shells, as a hack for code reuse
|
798 | times %>%
|
799 | mutate(shell_name = runtime_name, shell_hash = runtime_hash) %>%
|
800 | select(c(host_name, host_hash, shell_name, shell_hash)) ->
|
801 | tmp
|
802 |
|
803 | distinct_hosts = DistinctHosts(tmp)
|
804 | Log('')
|
805 | Log('Distinct hosts')
|
806 | print(distinct_hosts)
|
807 |
|
808 | distinct_shells = DistinctShells(tmp)
|
809 | Log('')
|
810 | Log('Distinct runtimes')
|
811 | print(distinct_shells)
|
812 |
|
813 | num_hosts = nrow(distinct_hosts)
|
814 |
|
815 | times %>%
|
816 | select(-c(status, stdout_md5sum, stdout_filename, host_hash, runtime_hash)) %>%
|
817 | mutate(runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
|
818 | elapsed_ms = elapsed_secs * 1000,
|
819 | user_ms = user_secs * 1000,
|
820 | sys_ms = sys_secs * 1000,
|
821 | max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
|
822 | select(-c(runtime_name, elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
|
823 | arrange(host_name, task_name, arg1, arg2, user_ms) ->
|
824 | details
|
825 |
|
826 | times %>%
|
827 | mutate(
|
828 | runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
|
829 | stdout_md5sum_HREF = file.path('tmp', task_name, stdout_filename)) %>%
|
830 | select(c(host_name, task_name, arg1, arg2, runtime_label,
|
831 | stdout_md5sum, stdout_md5sum_HREF)) ->
|
832 | stdout_files
|
833 |
|
834 | details %>% filter(task_name == 'hello') %>% select(-c(task_name)) -> hello
|
835 | details %>% filter(task_name == 'fib') %>% select(-c(task_name)) -> fib
|
836 | details %>% filter(task_name == 'word_freq') %>% select(-c(task_name)) -> word_freq
|
837 | # There's no arg2
|
838 | details %>% filter(task_name == 'parse_help') %>% select(-c(task_name, arg2)) -> parse_help
|
839 |
|
840 | details %>% filter(task_name == 'bubble_sort') %>% select(-c(task_name)) -> bubble_sort
|
841 | details %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>% select(-c(task_name)) -> palindrome
|
842 |
|
843 | precision = ColumnPrecision(list(max_rss_MB = 1), default = 0)
|
844 | writeTsv(details, file.path(out_dir, 'details'), precision)
|
845 |
|
846 | writeTsv(stdout_files, file.path(out_dir, 'stdout_files'), precision)
|
847 |
|
848 | writeTsv(hello, file.path(out_dir, 'hello'), precision)
|
849 | writeTsv(fib, file.path(out_dir, 'fib'), precision)
|
850 | writeTsv(word_freq, file.path(out_dir, 'word_freq'), precision)
|
851 | writeTsv(parse_help, file.path(out_dir, 'parse_help'), precision)
|
852 |
|
853 | writeTsv(bubble_sort, file.path(out_dir, 'bubble_sort'), precision)
|
854 | writeTsv(palindrome, file.path(out_dir, 'palindrome'), precision)
|
855 |
|
856 | WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = T)
|
857 | }
|
858 |
|
859 | WriteOneTask = function(times, out_dir, task_name, precision) {
|
860 | times %>%
|
861 | filter(task == task_name) %>%
|
862 | select(-c(task)) -> subset
|
863 |
|
864 | writeTsv(subset, file.path(out_dir, task_name), precision)
|
865 | }
|
866 |
|
867 | SHELL_ORDER = c('dash',
|
868 | 'bash',
|
869 | 'zsh',
|
870 | '_bin/cxx-opt+bumpleak/osh',
|
871 | '_bin/cxx-opt+bumproot/osh',
|
872 | '_bin/cxx-opt+bumpsmall/osh',
|
873 | '_bin/cxx-opt/osh',
|
874 | '_bin/cxx-opt+nopool/osh')
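
# SHELL_ORDER is used below as factor levels, so arrange() sorts rows in this
# fixed order rather than alphabetically.  Small sketch (not executed):
if (FALSE) {
  tibble(sh_path = c('_bin/cxx-opt/osh', 'bash', 'dash')) %>%
    arrange(factor(sh_path, levels = SHELL_ORDER))  # dash, bash, then the osh binary
}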
|
875 |
|
876 | GcReport = function(in_dir, out_dir) {
|
877 | times = read.table(file.path(in_dir, 'raw/times.tsv'), header=T)
|
878 | gc_stats = read.table(file.path(in_dir, 'stage1/gc_stats.tsv'), header=T)
|
879 |
|
880 | times %>% filter(status != 0) -> failed
|
881 | if (nrow(failed) != 0) {
|
882 | print(failed)
|
883 | stop('Some gc tasks failed')
|
884 | }
|
885 |
|
886 | # Change units and order columns
|
887 | times %>%
|
888 | arrange(task, factor(sh_path, levels = SHELL_ORDER)) %>%
|
889 | mutate(elapsed_ms = elapsed_secs * 1000,
|
890 | user_ms = user_secs * 1000,
|
891 | sys_ms = sys_secs * 1000,
|
892 | max_rss_MB = max_rss_KiB * 1024 / 1e6,
|
893 | shell_label = ShellLabelFromPath(sh_path)
|
894 | ) %>%
|
895 | select(c(join_id, task, elapsed_ms, user_ms, sys_ms, max_rss_MB, shell_label,
|
896 | shell_runtime_opts)) ->
|
897 | times
|
898 |
|
899 | # Join and order columns
|
900 | gc_stats %>% left_join(times, by = c('join_id')) %>%
|
901 | arrange(desc(task)) %>%
|
902 | mutate(allocated_MB = bytes_allocated / 1e6) %>%
|
903 | # try to make the table skinnier
|
904 | rename(num_gc_done = num_collections) %>%
|
905 | select(task, elapsed_ms, max_gc_millis, total_gc_millis,
|
906 | allocated_MB, max_rss_MB, num_allocated,
|
907 | num_gc_points, num_gc_done, gc_threshold, num_growths, max_survived,
|
908 | shell_label) ->
|
909 | gc_stats
|
910 |
|
911 | times %>% select(-c(join_id)) -> times
|
912 |
|
913 |
|
914 | precision = ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
|
915 | default = 0)
|
916 |
|
917 | writeTsv(times, file.path(out_dir, 'times'), precision)
|
918 | writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision)
|
919 |
|
920 | tasks = c('parse.configure-coreutils',
|
921 | 'parse.configure-cpython',
|
922 | 'parse.abuild',
|
923 | 'ex.compute-fib',
|
924 | 'ex.bashcomp-parse-help',
|
925 | 'ex.abuild-print-help')
|
926 | # Write out a separate file for each task
|
927 | for (task in tasks) {
|
928 | WriteOneTask(times, out_dir, task, precision)
|
929 | }
|
930 | }
|
931 |
|
932 | GcCachegrindReport = function(in_dir, out_dir) {
|
933 | times = readTsv(file.path(in_dir, 'raw/times.tsv'))
|
934 | counts = readTsv(file.path(in_dir, 'stage1/cachegrind.tsv'))
|
935 |
|
936 | times %>% filter(status != 0) -> failed
|
937 | if (nrow(failed) != 0) {
|
938 | print(failed)
|
939 | stop('Some gc tasks failed')
|
940 | }
|
941 |
|
942 | print(times)
|
943 | print(counts)
|
944 |
|
945 | counts %>% left_join(times, by = c('join_id')) %>%
|
946 | mutate(million_irefs = irefs / 1e6) %>%
|
947 | select(c(million_irefs, task, sh_path, shell_runtime_opts)) %>%
|
948 | arrange(factor(sh_path, levels = SHELL_ORDER)) ->
|
949 | counts
|
950 |
|
951 | precision = NULL
|
952 | tasks = c('parse.abuild', 'ex.compute-fib')
|
953 | for (task in tasks) {
|
954 | WriteOneTask(counts, out_dir, task, precision)
|
955 | }
|
956 | }
|
957 |
|
958 | MyCppReport = function(in_dir, out_dir) {
|
959 | times = readTsv(file.path(in_dir, 'benchmark-table.tsv'))
|
960 | print(times)
|
961 |
|
962 | times %>% filter(status != 0) -> failed
|
963 | if (nrow(failed) != 0) {
|
964 | print(failed)
|
965 | stop('Some mycpp tasks failed')
|
966 | }
|
967 |
|
968 | # Drop columns we don't need: status, elapsed_secs, bin, task_out
|
969 | times %>% select(-c(status, elapsed_secs, bin, task_out)) %>%
|
970 | mutate(example_name_HREF = mycppUrl(example_name),
|
971 | gen = c('gen'),
|
972 | gen_HREF = genUrl(example_name),
|
973 | user_ms = user_secs * 1000,
|
974 | sys_ms = sys_secs * 1000,
|
975 | max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
|
976 | select(-c(user_secs, sys_secs, max_rss_KiB)) ->
|
977 | details
|
978 |
|
979 | details %>% select(-c(sys_ms, max_rss_MB)) %>%
|
980 | spread(key = impl, value = user_ms) %>%
|
981 | mutate(`C++ : Python` = `C++` / Python) %>%
|
982 | arrange(`C++ : Python`) ->
|
983 | user_time
|
984 |
|
985 | details %>% select(-c(user_ms, max_rss_MB)) %>%
|
986 | spread(key = impl, value = sys_ms) %>%
|
987 | mutate(`C++ : Python` = `C++` / Python) %>%
|
988 | arrange(`C++ : Python`) ->
|
989 | sys_time
|
990 |
|
991 | details %>% select(-c(user_ms, sys_ms)) %>%
|
992 | spread(key = impl, value = max_rss_MB) %>%
|
993 | mutate(`C++ : Python` = `C++` / Python) %>%
|
994 | arrange(`C++ : Python`) ->
|
995 | max_rss
|
996 |
|
997 | # Sometimes it speeds up by more than 10x
|
998 | precision1 = ColumnPrecision(list(`C++ : Python` = 3), default = 0)
|
999 | writeTsv(user_time, file.path(out_dir, 'user_time'), precision1)
|
1000 | writeTsv(sys_time, file.path(out_dir, 'sys_time'), precision1)
|
1001 |
|
1002 | precision2 = ColumnPrecision(list(`C++ : Python` = 2), default = 1)
|
1003 | writeTsv(max_rss, file.path(out_dir, 'max_rss'), precision2)
|
1004 |
|
1005 | writeTsv(details, file.path(out_dir, 'details'))
|
1006 | }
|
1007 |
|
1008 | UftraceTaskReport = function(env, task_name, summaries) {
|
1009 | # Need this again after redirect
|
1010 | MaybeDisableColor(stdout())
|
1011 |
|
1012 | task_env = env[[task_name]]
|
1013 |
|
1014 | untyped = task_env$untyped
|
1015 | typed = task_env$typed
|
1016 | strings = task_env$strings
|
1017 | slabs = task_env$slabs
|
1018 | reserve = task_env$reserve
|
1019 |
|
1020 | string_overhead = 17 # GC header (8) + len (4) + hash value (4) + NUL (1)
|
1021 | strings %>% mutate(obj_len = str_len + string_overhead) -> strings
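# e.g. a 5-character string is counted as obj_len = 5 + 17 = 22 bytes.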
|
1022 |
|
1023 | # TODO: Output these totals PER WORKLOAD, e.g. parsing big/small, executing
|
1024 | # big/small
|
1025 | #
|
1026 | # And then zoom in on distributions as well
|
1027 |
|
1028 | num_allocs = nrow(untyped)
|
1029 | total_bytes = sum(untyped$obj_len)
|
1030 |
|
1031 | untyped %>% group_by(obj_len) %>% count() %>% ungroup() -> untyped_hist
|
1032 | #print(untyped_hist)
|
1033 |
|
1034 | untyped_hist %>%
|
1035 | mutate(n_less_than = cumsum(n),
|
1036 | percent = n_less_than * 100.0 / num_allocs) ->
|
1037 | alloc_sizes
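
# n_less_than is a running total over ascending obj_len, so 'percent' reads as
# "% of all allocations of this size or smaller" (a cumulative distribution).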
|
1038 |
|
1039 | a24 = untyped_hist %>% filter(obj_len <= 24)
|
1040 | a48 = untyped_hist %>% filter(obj_len <= 48)
|
1041 | a96 = untyped_hist %>% filter(obj_len <= 96)
|
1042 |
|
1043 | allocs_24_bytes_or_less = sum(a24$n) * 100.0 / num_allocs
|
1044 | allocs_48_bytes_or_less = sum(a48$n) * 100.0 / num_allocs
|
1045 | allocs_96_bytes_or_less = sum(a96$n) * 100.0 / num_allocs
|
1046 |
|
1047 | Log('Percentage of allocs less than 48 bytes: %.1f', allocs_48_bytes_or_less)
|
1048 |
|
1049 | options(tibble.print_min=25)
|
1050 |
|
1051 | Log('')
|
1052 | Log('All allocations')
|
1053 | print(alloc_sizes %>% head(22))
|
1054 | print(alloc_sizes %>% tail(5))
|
1055 |
|
1056 | Log('')
|
1057 | Log('Common Sizes')
|
1058 | print(untyped_hist %>% arrange(desc(n)) %>% head(8))
|
1059 |
|
1060 | Log('')
|
1061 | Log(' %s total allocations, total bytes = %s', commas(num_allocs), commas(total_bytes))
|
1062 | Log('')
|
1063 |
|
1064 | Log('Typed allocations')
|
1065 |
|
1066 | num_typed = nrow(typed)
|
1067 |
|
1068 | typed %>% group_by(func_name) %>% count() %>% ungroup() %>%
|
1069 | mutate(percent = n * 100.0 / num_typed) %>%
|
1070 | arrange(desc(n)) -> most_common_types
|
1071 |
|
1072 | print(most_common_types %>% head(20))
|
1073 | print(most_common_types %>% tail(5))
|
1074 |
|
1075 | lists = typed %>% filter(str_starts(func_name, ('List<')))
|
1076 | #print(lists)
|
1077 |
|
1078 | num_lists = nrow(lists)
|
1079 | total_list_bytes = num_lists * 24 # sizeof List<T> head is hard-coded
|
1080 |
|
1081 | Log('')
|
1082 | Log('%s typed allocs, including %s List<T>', commas(num_typed), commas(num_lists))
|
1083 | Log('%.2f%% of allocs are typed', num_typed * 100 / num_allocs)
|
1084 | Log('')
|
1085 |
|
1086 | #
|
1087 | # Strings
|
1088 | #
|
1089 |
|
1090 | num_strings = nrow(strings)
|
1091 | total_string_bytes = sum(strings$obj_len)
|
1092 |
|
1093 | strings %>% group_by(str_len) %>% count() %>% ungroup() %>%
|
1094 | mutate(n_less_than = cumsum(n),
|
1095 | percent = n_less_than * 100.0 / num_strings) ->
|
1096 | string_lengths
|
1097 |
|
1098 | strs_6_bytes_or_less = string_lengths %>% filter(str_len == 6) %>% select(percent)
|
1099 | strs_14_bytes_or_less = string_lengths %>% filter(str_len == 14) %>% select(percent)
|
1100 |
|
1101 | # Parse workload
|
1102 | # 62% of strings <= 6 bytes
|
1103 | # 84% of strings <= 14 bytes
|
1104 |
|
1105 | Log('Str - NewStr() and OverAllocatedStr()')
|
1106 | print(string_lengths %>% head(16))
|
1107 | print(string_lengths %>% tail(5))
|
1108 | Log('')
|
1109 |
|
1110 | Log('%s string allocations, total length = %s, total bytes = %s', commas(num_strings),
|
1111 | commas(sum(strings$str_len)), commas(total_string_bytes))
|
1112 | Log('')
|
1113 | Log('%.2f%% of allocs are strings', num_strings * 100 / num_allocs)
|
1114 | Log('%.2f%% of bytes are strings', total_string_bytes * 100 / total_bytes)
|
1115 | Log('')
|
1116 |
|
1117 | #
|
1118 | # Slabs
|
1119 | #
|
1120 |
|
1121 | Log('NewSlab()')
|
1122 |
|
1123 | num_slabs = nrow(slabs)
|
1124 | slabs %>% group_by(slab_len) %>% count() %>% ungroup() %>%
|
1125 | mutate(n_less_than = cumsum(n),
|
1126 | percent = n_less_than * 100.0 / num_slabs) ->
|
1127 | slab_lengths
|
1128 |
|
1129 | slabs %>% group_by(func_name) %>% count() %>% ungroup() %>%
|
1130 | arrange(desc(n)) -> slab_types
|
1131 |
|
1132 | Log(' Lengths')
|
1133 | print(slab_lengths %>% head())
|
1134 | print(slab_lengths %>% tail(5))
|
1135 | Log('')
|
1136 |
|
1137 | Log(' Slab Types')
|
1138 | print(slab_types %>% head())
|
1139 | print(slab_types %>% tail(5))
|
1140 | Log('')
|
1141 |
|
1142 | total_slab_items = sum(slabs$slab_len)
|
1143 |
|
1144 | Log('%s slabs, total items = %s', commas(num_slabs),
|
1145 | commas(sum(slabs$slab_len)))
|
1146 | Log('%.2f%% of allocs are slabs', num_slabs * 100 / num_allocs)
|
1147 | Log('')
|
1148 |
|
1149 | #
|
1150 | # reserve() calls
|
1151 | #
|
1152 |
|
1153 | # There should be strictly more List::reserve() calls than NewSlab
|
1154 |
|
1155 | Log('::reserve(int n)')
|
1156 | Log('')
|
1157 |
|
1158 | num_reserve = nrow(reserve)
|
1159 | reserve %>% group_by(num_items) %>% count() %>% ungroup() %>%
|
1160 | mutate(n_less_than = cumsum(n),
|
1161 | percent = n_less_than * 100.0 / num_reserve) ->
|
1162 | reserve_args
|
1163 |
|
1164 | Log(' Num Items')
|
1165 | print(reserve_args %>% head(15))
|
1166 | print(reserve_args %>% tail(5))
|
1167 | Log('')
|
1168 |
|
1169 | Log('%s reserve() calls, total items = %s', commas(num_reserve),
|
1170 | commas(sum(reserve$num_items)))
|
1171 | Log('')
|
1172 |
|
1173 | # Accounting for all allocations!
|
1174 | Log('Untyped: %s', commas(num_allocs))
|
1175 | Log('Typed + Str + Slab: %s', commas(num_typed + num_strings + num_slabs))
|
1176 | Log('')
|
1177 |
|
1178 | num_other_typed = num_typed - num_lists
|
1179 |
|
1180 | # Summary table
|
1181 | stats = tibble(task = task_name,
|
1182 | total_bytes_ = commas(total_bytes),
|
1183 | num_allocs_ = commas(num_allocs),
|
1184 | sum_typed_strs_slabs = commas(num_typed + num_strings + num_slabs),
|
1185 | num_reserve_calls = commas(num_reserve),
|
1186 |
|
1187 | percent_list_allocs = Percent(num_lists, num_allocs),
|
1188 | percent_slab_allocs = Percent(num_slabs, num_allocs),
|
1189 | percent_string_allocs = Percent(num_strings, num_allocs),
|
1190 | percent_other_typed_allocs = Percent(num_other_typed, num_allocs),
|
1191 |
|
1192 | percent_list_bytes = Percent(total_list_bytes, total_bytes),
|
1193 | percent_string_bytes = Percent(total_string_bytes, total_bytes),
|
1194 |
|
1195 | allocs_24_bytes_or_less = sprintf('%.1f%%', allocs_24_bytes_or_less),
|
1196 | allocs_48_bytes_or_less = sprintf('%.1f%%', allocs_48_bytes_or_less),
|
1197 | allocs_96_bytes_or_less = sprintf('%.1f%%', allocs_96_bytes_or_less),
|
1198 |
|
1199 | strs_6_bytes_or_less = sprintf('%.1f%%', strs_6_bytes_or_less),
|
1200 | strs_14_bytes_or_less = sprintf('%.1f%%', strs_14_bytes_or_less),
|
1201 | )
|
1202 | summaries$stats[[task_name]] = stats
|
1203 |
|
1204 | summaries$most_common_types[[task_name]] = most_common_types
|
1205 | }
|
1206 |
|
1207 | LoadUftraceTsv = function(in_dir, env) {
|
1208 | for (task in list.files(in_dir)) {
|
1209 | Log('Loading data for task %s', task)
|
1210 | base_dir = file.path(in_dir, task)
|
1211 |
|
1212 | task_env = new.env()
|
1213 | env[[task]] = task_env
|
1214 |
|
1215 | # TSV file, not CSV
|
1216 | task_env$untyped = readTsv(file.path(base_dir, 'all-untyped.tsv'))
|
1217 | task_env$typed = readTsv(file.path(base_dir, 'typed.tsv'))
|
1218 | task_env$strings = readTsv(file.path(base_dir, 'strings.tsv'))
|
1219 | task_env$slabs = readTsv(file.path(base_dir, 'slabs.tsv'))
|
1220 | task_env$reserve = readTsv(file.path(base_dir, 'reserve.tsv'))
|
1221 |
|
1222 | # median string length is 4, mean is 9.5!
|
1223 | Log('UNTYPED')
|
1224 | print(summary(task_env$untyped))
|
1225 | Log('')
|
1226 |
|
1227 | Log('TYPED')
|
1228 | print(summary(task_env$typed))
|
1229 | Log('')
|
1230 |
|
1231 | Log('STRINGS')
|
1232 | print(summary(task_env$strings))
|
1233 | Log('')
|
1234 |
|
1235 | Log('SLABS')
|
1236 | print(summary(task_env$slabs))
|
1237 | Log('')
|
1238 |
|
1239 | Log('RESERVE')
|
1240 | print(summary(task_env$reserve))
|
1241 | Log('')
|
1242 | }
|
1243 | }
|
1244 |
|
1245 | Percent = function(n, total) {
|
1246 | sprintf('%.1f%%', n * 100.0 / total)
|
1247 | }
|
1248 |
|
1249 | PrettyPrintLong = function(d) {
|
1250 | tr = t(d) # transpose
|
1251 |
|
1252 | row_names = rownames(tr)
|
1253 |
|
1254 | for (i in 1:nrow(tr)) {
|
1255 | row_name = row_names[i]
|
1256 | cat(sprintf('%26s', row_name)) # calculated min width manually
|
1257 | cat(sprintf('%20s', tr[i,]))
|
1258 | cat('\n')
|
1259 |
|
1260 | # Extra spacing
|
1261 | if (row_name %in% c('num_reserve_calls',
|
1262 | 'percent_string_bytes',
|
1263 | 'percent_other_typed_allocs',
|
1264 | 'allocs_96_bytes_or_less')) {
|
1265 | cat('\n')
|
1266 | }
|
1267 | }
|
1268 | }
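
# Sketch of PrettyPrintLong() input/output shape (made-up values, not
# executed): a one-row-per-task tibble prints as one line per metric, with
# tasks as columns.
if (FALSE) {
  PrettyPrintLong(tibble(task = c('parse.abuild', 'ex.compute-fib'),
                         num_allocs_ = c('1,000', '2,000')))
}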
|
1269 |
|
1270 |
|
1271 | UftraceReport = function(env, out_dir) {
|
1272 | # summaries$stats should be a list of 1-row data frames
|
1273 | # summaries$most_common_types should be a list of data frames
|
1274 | summaries = new.env()
|
1275 |
|
1276 | for (task_name in names(env)) {
|
1277 | report_out = file.path(out_dir, paste0(task_name, '.txt'))
|
1278 |
|
1279 | Log('Making report for task %s -> %s', task_name, report_out)
|
1280 |
|
1281 | sink(file = report_out)
|
1282 | UftraceTaskReport(env, task_name, summaries)
|
1283 | sink() # reset
|
1284 | }
|
1285 | Log('')
|
1286 |
|
1287 | # Concatenate all the data frames added to summaries
|
1288 | stats = bind_rows(as.list(summaries$stats))
|
1289 |
|
1290 | sink(file = file.path(out_dir, 'summary.txt'))
|
1291 | #print(stats)
|
1292 | #Log('')
|
1293 |
|
1294 | PrettyPrintLong(stats)
|
1295 | Log('')
|
1296 |
|
1297 | mct = summaries$most_common_types
|
1298 | for (task_name in names(mct)) {
|
1299 | Log('Common types in workload %s', task_name)
|
1300 | Log('')
|
1301 |
|
1302 | print(mct[[task_name]] %>% head(5))
|
1303 | Log('')
|
1304 | }
|
1305 | sink()
|
1306 |
|
1307 | # For the REPL
|
1308 | return(list(stats = stats))
|
1309 | }
|
1310 |
|
1311 | main = function(argv) {
|
1312 | action = argv[[1]]
|
1313 | in_dir = argv[[2]]
|
1314 | out_dir = argv[[3]]
|
1315 |
|
1316 | if (action == 'osh-parser') {
|
1317 | ParserReport(in_dir, out_dir)
|
1318 |
|
1319 | } else if (action == 'osh-runtime') {
|
1320 | RuntimeReport(in_dir, out_dir)
|
1321 |
|
1322 | } else if (action == 'vm-baseline') {
|
1323 | VmBaselineReport(in_dir, out_dir)
|
1324 |
|
1325 | } else if (action == 'ovm-build') {
|
1326 | OvmBuildReport(in_dir, out_dir)
|
1327 |
|
1328 | } else if (action == 'compute') {
|
1329 | ComputeReport(in_dir, out_dir)
|
1330 |
|
1331 | } else if (action == 'gc') {
|
1332 | GcReport(in_dir, out_dir)
|
1333 |
|
1334 | } else if (action == 'gc-cachegrind') {
|
1335 | GcCachegrindReport(in_dir, out_dir)
|
1336 |
|
1337 | } else if (action == 'mycpp') {
|
1338 | MyCppReport(in_dir, out_dir)
|
1339 |
|
1340 | } else if (action == 'uftrace') {
|
1341 | d = new.env()
|
1342 | LoadUftraceTsv(in_dir, d)
|
1343 | UftraceReport(d, out_dir)
|
1344 |
|
1345 | } else {
|
1346 | Log("Invalid action '%s'", action)
|
1347 | quit(status = 1)
|
1348 | }
|
1349 | Log('PID %d done', Sys.getpid())
|
1350 | }
|
1351 |
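# Run main() only when executed as a script (sys.frames() is empty), not when
# this file is source()'d from another frame, e.g. an interactive session.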
|
1352 | if (length(sys.frames()) == 0) {
|
1353 | # increase ggplot font size globally
|
1354 | #theme_set(theme_grey(base_size = 20))
|
1355 |
|
1356 | main(commandArgs(TRUE))
|
1357 | }
|