| 1 | #!/usr/bin/env python2 | 
| 2 | """ | 
| 3 | oheap2.py | 
| 4 |  | 
| 5 | OVM2 stores objects in a data structure called OHeap2.  It has the following | 
| 6 | concepts. | 
| 7 |  | 
| 8 | - Handles: 4-byte / 32-bit integers that point to Cells. | 
| 9 | - Cells: 16-byte structs with a type tag, in a contiguous array.  This array is | 
| 10 | scanned by the GC. | 
| 11 | - Slabs: Variable-length arrays of memory that are owned by a Cell.  Each slab | 
| 12 | may be malloc'd individually. | 
| 13 | - In the case of a string, it's opaque to the GC. | 
| 14 | - In the case of a tuple or struct, it must be scanned by the GC. | 
| 15 |  | 
| 16 | This is called the "4-16-N design". | 
| 17 |  | 
| 18 | Every cell has a type tag, which is simply an integer.  Negative integers | 
| 19 | indicate native types implemented in C.  Positive integers are for user-defined | 
| 20 | types. | 
| 21 |  | 
| 22 | Cells also have an is_slab bit and a length for small.  TODO: is_slab should be | 
| 23 | is_small? | 
| 24 |  | 
| 25 | Operations on OHeap | 
| 26 | ------------------- | 
| 27 |  | 
| 28 | 1. Allocate a new cell, possibly resizing | 
| 29 |  | 
| 30 | 2. Garbage Collect | 
| 31 | - Walk all the cells.  Free unused cells along with their slabs. | 
| 32 |  | 
| 33 | 3. Save to file | 
| 34 | Slabs: Native Pointers to Offsets | 
| 35 |  | 
| 36 | 3a. Copy an object tree to a fresh oheap?  e.g. for IPC.  I guess this is like "Save". | 
| 37 | - Walk all the cells and slabs | 
| 38 |  | 
| 39 | 4. Load from file into fresh OHeap | 
| 40 | - Patch Slab Offsets to Native Pointers | 
| 41 |  | 
| 42 | 4a. Load from file and merge into existing OHeap? | 
| 43 | - Inside a namespace? | 
| 44 | - Import? | 
| 45 | - Patch all cell refs! | 
| 46 |  | 
| 47 | 5. Make permanent?  Move to another "generation"?  OPy classes and bytecode | 
| 48 | perhaps fit in this category. | 
| 49 |  | 
| 50 | TODO | 
| 51 | ---- | 
| 52 |  | 
| 53 | - Bits on handle: whether it's const or not? | 
| 54 | - negative handles could be const? | 
| 55 |  | 
| 56 | - Singleton values for None/True/False | 
| 57 | - interning of strings | 
| 58 | - hash tables, and hashes of strings | 
| 59 |  | 
| 60 | Introspection | 
| 61 | ------------- | 
| 62 |  | 
| 63 | These operations should work: | 
| 64 |  | 
| 65 | type() | 
| 66 | getattr() -- hm in OVM I don't want to look at base classes | 
| 67 | dir() -- list all attributes.  ditto -- don't look at base classes? | 
| 68 | hm except this returns different stuff for modules, classes, and objects | 
| 69 | it looks at superclasses | 
| 70 | isinstance(obj, typ) - check if the type tag of obj points at typ or a superclass | 
| 71 | """ | 
| 72 | from __future__ import print_function | 
| 73 |  | 
| 74 | import struct | 
| 75 | import sys | 
| 76 | import types | 
| 77 |  | 
| 78 | from core.util import log | 
| 79 |  | 
| 80 |  | 
| 81 | TAG_NONE = -1 | 
| 82 | TAG_BOOL = -2 | 
| 83 | TAG_INT = -3 | 
| 84 | TAG_FLOAT = -4 | 
| 85 | TAG_STR = -5 | 
| 86 | TAG_TUPLE = -6 | 
| 87 | TAG_CODE = -7 | 
| 88 |  | 
| 89 | MIN_SMALL_INT = -(1 << 63) | 
| 90 | MAX_SMALL_INT = (1 << 63) - 1 | 
| 91 |  | 
| 92 | MAX_LEN_SMALL_STR = 11  # 12 bytes - 1 for NUL | 
| 93 |  | 
| 94 | MAX_LEN_SMALL_TUPLE = 3  # 4 Handles in 12 bytes? | 
| 95 | # We can steal a bit from the tag for the small/big | 
| 96 | # bit. | 
| 97 |  | 
| 98 | def u8(i):  # 1 byte unsigned | 
| 99 | return struct.pack('B', i) | 
| 100 |  | 
| 101 | def i16(i):  # 2 bytes | 
| 102 | return struct.pack('h', i) | 
| 103 |  | 
| 104 | def i32(i):  # 4 bytes | 
| 105 | return struct.pack('i', i) | 
| 106 |  | 
| 107 | def i64(i):  # 8 bytes (long long) | 
| 108 | return struct.pack('q', i) | 
| 109 |  | 
| 110 | def f64(i):  # 8 byte double | 
| 111 | return struct.pack('d', i) | 
| 112 |  | 
| 113 |  | 
| 114 | def Align4(i): | 
| 115 | """Round up to the nearest multiple of 4.  See unit tests.""" | 
| 116 | return ((i-1) | 3) + 1 | 
| 117 |  | 
| 118 |  | 
| 119 | def Align16(i): | 
| 120 | """Round up to the nearest multiple of 16.  See unit tests.""" | 
| 121 | return ((i-1) | 15) + 1 | 
| 122 |  | 
| 123 |  | 
| 124 | class Encoder(object): | 
| 125 | """ | 
| 126 | Write objects into an OHeap2 structure that can be lazily loaded. | 
| 127 |  | 
| 128 | First pass: | 
| 129 | Append to cells and append to slabs | 
| 130 | Second pass: | 
| 131 | Write slabs as bytes, and then patch offsets in cells? | 
| 132 | Write all the cells | 
| 133 | Write the root object at the front of the file?  Or do it at the end? | 
| 134 | OHeap writes it at the beginning after | 
| 135 | """ | 
| 136 |  | 
| 137 | def __init__(self): | 
| 138 | self.chunk = bytearray() | 
| 139 | # An array of cells | 
| 140 | self.cells = [] | 
| 141 | # Write all these first?  So that the cells can point to them. | 
| 142 | self.slabs = [] | 
| 143 |  | 
| 144 | def Any(self, obj): | 
| 145 | """ | 
| 146 | Encode an object and return its id. | 
| 147 | """ | 
| 148 | id_ = len(self.cells) | 
| 149 |  | 
| 150 | # TODO: Enforce that None is a singleton.  But what about joining OHeaps? | 
| 151 | if isinstance(obj, types.NoneType): | 
| 152 | self.cells.append((TAG_NONE, False, None)) | 
| 153 |  | 
| 154 | # TODO: Enforce that True/False is a singleton.  But what about joining | 
| 155 | # OHeaps? | 
| 156 | elif isinstance(obj, bool): | 
| 157 | self.cells.append((TAG_BOOL, False, obj)) | 
| 158 |  | 
| 159 | elif isinstance(obj, int): | 
| 160 | i = obj | 
| 161 | if MIN_SMALL_INT < i < MAX_SMALL_INT: | 
| 162 | self.cells.append((TAG_INT, False, i)) | 
| 163 | else: | 
| 164 | raise NotImplementedError | 
| 165 |  | 
| 166 | elif isinstance(obj, float): | 
| 167 | raise NotImplementedError | 
| 168 |  | 
| 169 | # TODO: Identical strings could be "interned" here. | 
| 170 | elif isinstance(obj, str): | 
| 171 | s = obj | 
| 172 | n = len(s) | 
| 173 | if n < MAX_LEN_SMALL_STR: | 
| 174 | self.cells.append((TAG_STR, False, s)) | 
| 175 | else: | 
| 176 | # length and they payload | 
| 177 | slab_index = len(self.slabs) | 
| 178 | self.slabs.append((n, s)) | 
| 179 | self.cells.append((TAG_STR, True, slab_index)) | 
| 180 |  | 
| 181 | elif isinstance(obj, tuple): | 
| 182 | t = obj | 
| 183 | refs = [] | 
| 184 | for item in t: | 
| 185 | refs.append(self.Any(item))  # Depth-first. | 
| 186 |  | 
| 187 | # Compute ID after adding all the children. | 
| 188 | id_ = len(self.cells) | 
| 189 |  | 
| 190 | n = len(t) | 
| 191 | if n < MAX_LEN_SMALL_TUPLE: | 
| 192 | # TODO: How do we know how long it is? | 
| 193 | self.cells.append((TAG_TUPLE, False, refs)) | 
| 194 | else: | 
| 195 | slab_index = len(self.slabs) | 
| 196 | self.slabs.append((n, refs)) | 
| 197 | self.cells.append((TAG_TUPLE, True, slab_index)) | 
| 198 |  | 
| 199 | elif isinstance(obj, types.CodeType): | 
| 200 | co = obj | 
| 201 | refs = [] | 
| 202 |  | 
| 203 | # Ints first | 
| 204 | refs.append(self.Any(co.co_argcount)) | 
| 205 | refs.append(self.Any(co.co_nlocals)) | 
| 206 | refs.append(self.Any(co.co_stacksize)) | 
| 207 | refs.append(self.Any(co.co_flags)) | 
| 208 | refs.append(self.Any(co.co_firstlineno)) | 
| 209 |  | 
| 210 | # Strings | 
| 211 | refs.append(self.Any(co.co_name)) | 
| 212 | refs.append(self.Any(co.co_filename)) | 
| 213 | refs.append(self.Any(co.co_code))  # bytecode | 
| 214 |  | 
| 215 | # Tuples | 
| 216 | refs.append(self.Any(co.co_names)) | 
| 217 | refs.append(self.Any(co.co_varnames)) | 
| 218 | refs.append(self.Any(co.co_consts)) | 
| 219 |  | 
| 220 | slab_index = len(self.slabs) | 
| 221 | self.slabs.append((len(refs), refs)) | 
| 222 | self.cells.append((TAG_CODE, True, slab_index)) | 
| 223 |  | 
| 224 | else: | 
| 225 | raise AssertionError | 
| 226 |  | 
| 227 | return id_ | 
| 228 |  | 
| 229 | def Write(self, f): | 
| 230 | f.write('OHP2')  # magic header | 
| 231 | f.write(i32(0))  # placeholder for cell offset | 
| 232 | f.write(i32(0))  # placeholder for root object.  NOTE: We could always | 
| 233 | # fseek(SEEK_END, -4)?  But I think we want it to more | 
| 234 | # self-describing. | 
| 235 |  | 
| 236 | # Write slabs first, so we know their offsets. | 
| 237 | slab_offsets = [] | 
| 238 | pos = 0  # Reader should patch relative to 12 | 
| 239 |  | 
| 240 | for length, payload in self.slabs: | 
| 241 |  | 
| 242 | if isinstance(payload, str): | 
| 243 | slab_offsets.append(pos) | 
| 244 |  | 
| 245 | f.write(i32(length))  # length in bytes | 
| 246 | pos += 4 | 
| 247 |  | 
| 248 | n = len(payload) | 
| 249 | aligned = Align4(n+1)  # at least NUL byte for terminator | 
| 250 | f.write(payload) | 
| 251 | f.write('\0' * (aligned - n))  # padding | 
| 252 | pos += aligned | 
| 253 |  | 
| 254 | elif isinstance(payload, list):  # list of references | 
| 255 | slab_offsets.append(pos) | 
| 256 |  | 
| 257 | # length in references.  Could be unsigned? | 
| 258 | f.write(i32(length)) | 
| 259 |  | 
| 260 | # NOTE: There is no GC offset, since all of them are scanned? | 
| 261 | for ref in payload: | 
| 262 | f.write(i32(ref)) | 
| 263 | pos += 4 | 
| 264 | pos += len(payload) * 4 | 
| 265 |  | 
| 266 | else: | 
| 267 | raise AssertionError(payload) | 
| 268 |  | 
| 269 | log('Slab offsets: %s', slab_offsets) | 
| 270 |  | 
| 271 | # Pad out the slabs so that the cells begins at a multiple of 16. | 
| 272 | total_slab_size = Align16(pos)  # including pad, but not including header. | 
| 273 | f.write('\0' * (total_slab_size - pos))  # Write the pad | 
| 274 |  | 
| 275 | # Encode it into 16 bytes | 
| 276 | for tag, is_slab, val in self.cells: | 
| 277 | #log('%s %s %s', tag, is_slab, val) | 
| 278 |  | 
| 279 | # 4 byte tag.  This may be patched into a type Handle? | 
| 280 | # Reserve LSB for is_slab?  Or highest bit? | 
| 281 | # The tag could be (i >> 1) then? | 
| 282 | f.write(i16(tag)) | 
| 283 |  | 
| 284 | if tag == TAG_NONE: | 
| 285 | f.write(i16(0)) | 
| 286 | f.write(i32(0)) | 
| 287 | f.write(i32(0)) | 
| 288 | f.write(i32(0)) | 
| 289 |  | 
| 290 | elif tag == TAG_BOOL: | 
| 291 | f.write(i16(0))  # pad | 
| 292 | f.write(i32(0))  # pad | 
| 293 | f.write(i32(0))  # pad | 
| 294 | f.write(i32(int(val)))  # 0 or 1 | 
| 295 |  | 
| 296 | elif tag == TAG_INT: | 
| 297 | assert not is_slab, val | 
| 298 | f.write(i16(0))  # Padding | 
| 299 | f.write(i32(0))  # Padding | 
| 300 | f.write(i64(val)) | 
| 301 |  | 
| 302 | elif tag == TAG_FLOAT: | 
| 303 | assert not is_slab, val | 
| 304 | f.write(i16(0))  # Padding | 
| 305 | f.write(i32(0))  # Padding | 
| 306 | f.write(f64(val)) | 
| 307 |  | 
| 308 | elif tag == TAG_STR: | 
| 309 | if is_slab: | 
| 310 | # For a string, a big/small bit could technically be in the last | 
| 311 | # byte.  To reuse NUL terminator.  But I want the patching process to | 
| 312 | # be consistent. | 
| 313 | slab_index = val | 
| 314 | offset = slab_offsets[slab_index] | 
| 315 | f.write(u8(1))  # is_slab | 
| 316 | f.write(u8(0))  # length stored in slab | 
| 317 | f.write(i32(0))  # pad | 
| 318 | f.write(i32(0))  # pad | 
| 319 | f.write(i32(offset)) | 
| 320 | else: | 
| 321 | n = len(val) | 
| 322 | f.write(u8(0))  # is_slab | 
| 323 | f.write(u8(n))  # length stored here | 
| 324 | f.write(val) | 
| 325 | num_pad = 12 - n  # at least one NUL | 
| 326 | f.write('\0' * num_pad) | 
| 327 |  | 
| 328 | elif tag == TAG_TUPLE: | 
| 329 | if is_slab: | 
| 330 | slab_index = val | 
| 331 | offset = slab_offsets[slab_index] | 
| 332 | f.write(u8(1))  # is_slab | 
| 333 | f.write(u8(0))  # length stored in slab | 
| 334 | f.write(i32(0))  # pad | 
| 335 | f.write(i32(0))  # pad | 
| 336 | f.write(i32(offset)) | 
| 337 | else: | 
| 338 | n = len(val) | 
| 339 | f.write(u8(0))  # is_slab | 
| 340 | f.write(u8(n)) | 
| 341 | # TODO: how is length represented? | 
| 342 | for ref in val: | 
| 343 | f.write(i32(ref)) | 
| 344 | num_pad = 3 - len(val) | 
| 345 | for i in xrange(num_pad): | 
| 346 | f.write(i32(0)) | 
| 347 |  | 
| 348 | elif tag == TAG_CODE: | 
| 349 | assert is_slab, val | 
| 350 | slab_index = val | 
| 351 | #log('slab_index %d', slab_index) | 
| 352 | offset = slab_offsets[slab_index] | 
| 353 | f.write(u8(1)) # is_slab | 
| 354 | f.write(u8(0)) # length stored in slab | 
| 355 | f.write(i32(0))  # pad | 
| 356 | f.write(i32(0))  # pad | 
| 357 | f.write(i32(offset)) | 
| 358 |  | 
| 359 | else: | 
| 360 | raise AssertionError(tag) | 
| 361 |  | 
| 362 | log('') | 
| 363 | log('slabs') | 
| 364 | for slab in self.slabs: | 
| 365 | log('\t%r', slab) | 
| 366 |  | 
| 367 | log('cells') | 
| 368 | for c in self.cells: | 
| 369 | #log('\t%r', c) | 
| 370 | pass | 
| 371 |  | 
| 372 | log('%d slabs in %d bytes', len(self.slabs), total_slab_size) | 
| 373 | log('%d cells in %d bytes', len(self.cells), f.tell() - 12 - total_slab_size) | 
| 374 |  | 
| 375 | # Fill in the cell position | 
| 376 | f.seek(4) | 
| 377 | f.write(i32(total_slab_size)) | 
| 378 | f.write(i32(len(self.cells))) | 
| 379 |  | 
| 380 |  | 
| 381 | def Write(co, f): | 
| 382 | print(co) | 
| 383 | enc = Encoder() | 
| 384 | enc.Any(co) | 
| 385 | enc.Write(f) | 
| 386 |  | 
| 387 |  | 
| 388 | def main(argv): | 
| 389 | chunk = bytearray() | 
| 390 | chunk.extend('hello') | 
| 391 | chunk.append('\0') | 
| 392 |  | 
| 393 | print('Hello from oheap2.py') | 
| 394 | print(repr(chunk)) | 
| 395 |  | 
| 396 |  | 
| 397 | if __name__ == '__main__': | 
| 398 | try: | 
| 399 | main(sys.argv) | 
| 400 | except RuntimeError as e: | 
| 401 | print('FATAL: %s' % e, file=sys.stderr) | 
| 402 | sys.exit(1) |