#!/usr/bin/env python3 """ A binary-based pairing function and a list codec based on it. y = BPe(a, b) encodes two positive ints as a single positive int, such that log2(y) = log2(a*b) + log2(log2(a*b)) +1/-2 a, b = BPd(y) decodes. Among the BP_list class's methods are the following: Given a list or other iterable L, of zero or more positive ints, B = BP_list(L) creates a list-like object using BPe pairing. B.append(n) appends a positive int to the list. y = B.as_int() encodes the object to a single positive int. C = BP_list(y) decodes the int back to an equivalent object. L = list(C.iter_pop()) gets, but removes, the contents. Theory and implementation notes are in the Jupyter notebook binary_hyperbolic_pairing.ipynb """ from math import log2 as _log2, floor as _floor from numbers import Integral as _Integral def BPe(a, b): """ Given int a >= 1 and int b >= 1, return the pair encoded as int y >= 1. """ a_sz, b_sz = a.bit_length() - 1, b.bit_length() - 1 a_mant, b_mant = a - (1 << a_sz), b - (1 << b_sz) pair_mant = (a_mant << b_sz) + b_mant pair_sz = a_sz + b_sz return pair_mant_start(pair_sz) + (a_sz << pair_sz) + pair_mant def BPd(y): """ Given int y >= 1 -- an encoded pair, return (a, b) -- the pair. """ pair_sz, start = decode_pair_mant_start(y) offset = y - start a_sz = offset >> pair_sz b_sz = pair_sz - a_sz pair_mant = offset - (a_sz << pair_sz) a_mant = pair_mant >> b_sz b_mant = pair_mant - (a_mant << b_sz) return ((1 << a_sz) + a_mant, (1 << b_sz) + b_mant) def pair_mant_start(n): """ Given n, the number of mantissa bits in a pair, return y_n, the start of the encoded range for such pairs. """ return ((n - 1) << n) + 2 def decode_pair_mant_start(y): """ Given y, where y_n <= y < y_{n + 1}, y_n = (n - 1) * 2**n + 2 return n -- the number of pair mantissa bits in this group, and y_n (a.k.a. pair_mant_start(n)) -- the first code point of group n. I think this works for any y that Python and the machine can handle. I don't have a proof of this. It is specifically made to handle y > 2**(2**52). """ if y == 1: return 0, 1 if y <= 5: return 1, 2 if y < 2**47: # Arbitrary, semi-superstitious # place to switch strategy. m0 = _log2(y - 2) - 1 nmax = _floor(m0 - _log2(m0)) + 2 else: f, j = modf_log2(y) # Doesn't subtract 2 from y. j -= 1 ff, jj = modf_log2(j) # f isn't included in _log2(j). # Deliberately shift the crossover point upwards. nmax = j - jj + _floor(f - ff - .5) + 2 y_nmax = pair_mant_start(nmax) if y >= y_nmax: return nmax, y_nmax else: n = nmax - 1 return n, pair_mant_start(n) _MODF_LOG2_RES = 52 _MODF_LOG2_MUL = 2 ** -_MODF_LOG2_RES def modf_log2(i): """ Given an integer i >= 1, return approximately math.modf(math.log2(i)), i.e. the fractional and integer parts of the base-2 log: log2(i) % 1.0, floor(log2(i)), except o this works with any int i >= 1; o the floor part is returned as an int, not a float; o the result is always exact if i is a power of two; o if 2**n < i < 2**(n + 1) and (f, j) = modf_log2(i), then - j == n; and - 0.0 <= f < 1.0 and f always has about 51 bits of precision. """ assert isinstance(i, _Integral) and i >= 1, i j = i.bit_length() - 1 if j < _MODF_LOG2_RES: i <<= (_MODF_LOG2_RES - j) else: i >>= (j - _MODF_LOG2_RES) f = _log2(i * _MODF_LOG2_MUL) assert f < 1, (f, j) return f, j # ====================== BPd, BPe tests ===================== from time import process_time as _ptime def test_BP(max_y=10**5): """ "Decode" the first max_y code points to pairs, encode back to code points, and compare. """ start = _ptime() for y in range(1, max_y + 1): a, b = BPd(y) ye = BPe(a, b) assert ye == y, ("BPd/BPe failed.", y, a, b, ye) dur = _ptime() - start print("BPe and BPd passed. ", f"{max_y:.2e}", "(dec->enc)'s averaging", f"{dur/max_y:.2e}", "sec.") return True def test_decode_pair_mant_start(n_max=10000): """ For 1 <= n <= n_max and y_n = pair_mant_start(n), test decode_pair_mant_start(y) for y in {y_n - 1, y_n, y_n + 1}. These are the edge cases for pairs with between zero and n_max combined bits of mantissa. """ start = _ptime() prev_y_n = 1 for n in range(1, n_max + 1): y_n = pair_mant_start(n) for y in range(y_n - 1, y_n + 1 + 1): if y < y_n: correct = n - 1, prev_y_n else: correct = n, y_n ans = decode_pair_mant_start(y) if ans != correct: print("decode_pair_mant_start(", y, ") =", ans, "instead of", correct) return False prev_y_n = y_n dur = _ptime() - start print("decode_pair_mant_start() correct for 1 <= n <=", n_max, "in", dur, "sec.") return True # ========================== BP_list class ====================== from numbers import Integral as _Integral class BP_list(object): """ A list-like object that encodes a sequence of zero or more ints >= 1 as a single int >= 1, and/or decodes an int as a sequence of ints, using BPe() and BPd(). >>> L = BP_list( [123, 456, 1492] ) >>> y = L.as_int() # encode >>> M = BP_list(y) # decode >>> M.append(1776) >>> M.pop(0) 123 >>> M.pop(0) 456 >>> list(x for x in M.reversed_pop()) [1776, 1492] >>> len(M) 0 BP_list doesn't support random access, only append at the end, and pop or iterated pop at either end (positions 0 and -1). """ # In the following code there are many sequences like: # a, b = BPd(y); del y # or # y = BPe(a, b); del a, b # or # chunks.append( (length, node) ); del node # or # (length, node) = chunks.pop(0) # (pop deletes.) # The general idea is to let go of extra links # to possibly huge items in memory as soon as possible. # "del var" is done even if we're about to leave the # scope of var, as a way to remember that that var # needs to be deleted right away even if the source # code is edited. def __init__(self, y=1): self.chunks = [] self.length = 0 if y == 1: return try: self.extend(y) # Maybe it's an iterable... return except TypeError: pass # Never mind. assert isinstance(y, _Integral) and y >= 1, \ "initted with neither int >= 1 nor iterable" lengthoid, root = BPd(y); del y # THE RULE TO HANDLE EMPTY LISTS: if root == 1: length = lengthoid - 1 # (1, 1) = empty list. # (n > 1, 1) = list of n - 1 ones. else: length = lengthoid # (1, r > 1) = list of just r. # (n > 1, r > 1) = list of n not-all-ones. chunks = [ (length, root) ]; del root # Break down into power-of-two chunks. while True: last_len, last_node = chunks[-1] pow2 = 1 << (last_len.bit_length() - 1) if last_len > pow2: left_node, right_node = BPd(last_node); del last_node chunks[-1] = (pow2, left_node) chunks.append( (last_len - pow2, right_node) ) del left_node, right_node else: del last_node break self.chunks = chunks; del chunks self.length = length def as_int(self): """ If possible, encode self as an int and empty the list. WARNING: If elements have been popped from the front of the list, it becomes impossible to encode the internal data structure as an int in a reasonable way. """ # Check that the chunks can be encoded. prev_len = None total = 0 for chunk in self.chunks: chunk_len = chunk[0]; del chunk pow2 = 1 << (chunk_len.bit_length() - 1) assert chunk_len == pow2, "Broken chunks structure" assert not prev_len or prev_len > chunk_len, \ "as_int() after pop(0)" total += chunk_len prev_len = chunk_len # Combine chunks right-to-left. assert self.length == total, "total length mismatch" if self.length == 0: node = 1 else: node = self.chunks.pop(-1)[1] while self.chunks: node = BPe(self.chunks.pop(-1)[1], node) # See "THE RULE TO HANDLE EMPTY LISTS" in __init__(), above. if node == 1: return BPe(self.length + 1, node) else: return BPe(self.length, node) def __len__(self): return self.length def append(self, item): chunks = self.chunks chunks.append( (1, item) ) self.length += 1 # Merge same-size chunks on the right end. while len(chunks) >= 2 and chunks[-2][0] == chunks[-1][0]: right_len, right_node = chunks.pop(-1) left_len, left_node = chunks.pop(-1) chunks.append( (left_len + right_len, BPe(left_node, right_node)) ) del left_node, right_node def __iadd__(self, L): """ self += L => self.extend(L) """ self.extend(L) # += is a command, not a function, but it must return # a value for the variable. return self def extend(self, L): """ Append the elements of L to self one by one. """ for x in L: self.append(x) def pop(self, idx): """ Remove and return the first (if idx==0) or last (if idx=-1) element of the list. No other positions can be popped. WARNING: once any elements are popped from the beginning of the list, the remainder of the list can't be encoded with self.as_int(). """ if self.length == 0: raise IndexError("pop from empty list") end_len, end_node = self.chunks.pop(idx) if idx == 0: # Prepare to break down left-most chunks. outer = 0 # The left member of first pair is on left end. inner = 1 # The right member of first pair is further in. ins_pt = 0 # Insert leftovers before the beginning. elif idx == -1: # Prepare to break down right-most chunks. outer = 1 # Right member of last pair is right end. inner = 0 # Left member of last pair is further in. ins_pt = len(self.chunks) # Insert "before past the end." else: raise IndexError("pop index must be 0 or -1") pow2 = 1 << (end_len.bit_length() - 1) assert end_len == pow2, ("chunk len isn't a power of two", end_len) while end_len > 1: # Split the (end_len, end_node) chunk. nodes = BPd(end_node); del end_node end_node, inner_node = nodes[outer], nodes[inner]; del nodes end_len >>= 1 # Return the unused portion. self.chunks.insert(ins_pt, (end_len, inner_node)); del inner_node if idx == -1: ins_pt += 1 self.length -= 1 return end_node def iter_pop(self): while self: yield self.pop(0) def reversed_pop(self): while self: yield self.pop(-1) # =========================== BP_list test ========================== def test_BP_list(n=2000): """ "Decode" the first n positive ints to lists of ints, encode the lists back to single ints, and compare. Also tests that pop(0) followed by as_int() throws an AssertionError. """ L = BP_list( [1, 2, 3] ) L.pop(0) try: y = L.as_int() print("Didn't get AssertionError after pop(0) then as_int().") # NB: pop(0) on *some* BP_lists would leave them okay: # 2 or 1 elements => 1 or 0 elements => as_int() works. return False except AssertionError as e: pass # Assertion error is what we were looking for. start = _ptime() for y in range(1, n + 1): L = list(BP_list(y).iter_pop()) # Decode. if BP_list(L).as_int() != y: # Encode and compare. print("test_BP_list() failed at y =", y) return False dur = _ptime() - start print(f"BP_list passed, {n:g} decode=>encodes in {dur:.2f} sec.") return True # ============================ main (run tests) ========================= from sys import exit as _exit if __name__ == "__main__": try: for test in test_BP, test_decode_pair_mant_start, test_BP_list: assert test(), test.__name__ + " returned False." except Exception as e: print(repr(e)) _exit(1)