reader.pony

use "collections"

class Reader
  """
  Store network data and provide a parsing interface.

  `Reader` provides a way to extract typed data from a sequence of
  bytes. The `Reader` manages the underlying data structures to
  provide a read cursor over a contiguous sequence of bytes. It is
  useful for decoding data that is received over a network or stored
  in a file. Chunks of bytes are added to the `Reader` using the
  `append` method, and typed data is extracted using the getter
  methods.

  For example, suppose we have a UDP-based network data protocol where
  messages consist of the following:

  * `list_size` - the number of items in the following list of items
    as a big-endian 32-bit integer
  * zero or more items of the following data:
    * a big-endian 64-bit floating point number
    * a string that starts with a big-endian 32-bit integer that
      specifies the length of the string, followed by a number of
      bytes that represent the string

  A message would be something like this:

  ```
  [message_length][list_size][float1][string1][float2][string2]...
  ```

  The following program uses a `Reader` to decode a message of
  this type and print its items:

  ```pony
  use "buffered"
  use "collections"

  class Notify is InputNotify
    let _env: Env
    new create(env: Env) =>
      _env = env
    fun ref apply(data: Array[U8] iso) =>
      let rb = Reader
      rb.append(consume data)
      try
        while true do
          let len = rb.i32_be()?
          let items = rb.i32_be()?.usize()
          for range in Range(0, items) do
            let f = rb.f64_be()?
            let str_len = rb.i32_be()?.usize()
            let str = String.from_array(rb.block(str_len)?)
            _env.out.print("[(" + f.string() + "), (" + str + ")]")
          end
        end
      end

  actor Main
    new create(env: Env) =>
      env.input(recover Notify(env) end, 1024)
  ```
  """
  // Pending chunks in the order they were appended. Each entry pairs a chunk
  // with the offset of its first unread byte.
  embed _chunks: List[(Array[U8] val, USize)] = _chunks.create()
  // Running count of unread bytes; adjusted by `append`, `clear` and every
  // consuming read.
  var _available: USize = 0

  fun size(): USize =>
    """
    Report how many unread bytes the `Reader` currently holds.
    """
    let unread = _available
    unread

  fun ref clear() =>
    """
    Drop every buffered chunk and reset the count of available bytes to zero.
    """
    _available = 0
    _chunks.clear()

  fun ref append(data: ByteSeq) =>
    """
    Add a chunk of data to the end of the buffer.

    `data` may be an `Array[U8] val` or a `String`; a `String` is stored via
    its underlying byte array. Empty chunks are ignored: they contribute no
    bytes, and an empty chunk at the head of the queue would make the
    single-byte read in `_byte` fail on `data(offset)?` even though later
    chunks still hold unread bytes.
    """
    let data_array =
      match data
      | let data': Array[U8] val => data'
      | let data': String => data'.array()
      end

    let added = data_array.size()

    // Only enqueue chunks that actually carry bytes, so the head chunk always
    // has at least one unread byte.
    if added > 0 then
      _available = _available + added
      _chunks.push((data_array, 0))
    end

  fun ref skip(n: USize) ? =>
    """
    Discard the next `n` bytes. Raises an error, leaving the buffer untouched,
    if fewer than `n` bytes are available.
    """
    if n > _available then
      error
    end

    _available = _available - n
    var left = n

    while left > 0 do
      let first = _chunks.head()?
      (let bytes, let start) = first()?
      let in_chunk = bytes.size() - start

      if in_chunk > left then
        // The skip ends inside this chunk: just advance its read offset.
        first()? = (bytes, start + left)
        break
      end

      // The whole remainder of this chunk is skipped; drop it and go on.
      left = left - in_chunk
      _chunks.shift()?
    end

  fun ref block(len: USize): Array[U8] iso^ ? =>
    """
    Consume `len` bytes and return them as one contiguous array, copying
    across chunk boundaries where needed.
    Raises an error if `len` exceeds the number of bytes currently stored in
    the `Reader`.
    """
    if len > _available then
      error
    end

    _available = _available - len
    var result = recover Array[U8] .> undefined(len) end
    var filled = USize(0)

    while filled < len do
      let first = _chunks.head()?
      (let bytes, let start) = first()?

      let in_chunk = bytes.size() - start
      let missing = len - filled
      let take = missing.min(in_chunk)

      // Temporarily view the isolated result as `ref` so the chunk can copy
      // into it, then recover it back.
      result = recover
        let dst = consume ref result
        bytes.copy_to(dst, start, filled, take)
        consume dst
      end

      if in_chunk > missing then
        // The head chunk still has unread bytes: advance its offset and stop.
        first()? = (bytes, start + missing)
        break
      end

      filled = filled + take
      _chunks.shift()?
    end

    result


  fun ref read_until(separator: U8): Array[U8] iso^ ? =>
    """
    Return the bytes that precede the first occurrence of `separator`. The
    separator itself is consumed from the buffer but is not part of the
    returned array. Raises an error if the separator is not found. For text
    lines, prefer line(), which handles both \n and \r\n.
    """
    let distance = _distance_of(separator)?
    let before = block(distance - 1)?
    // Consume the separator itself.
    u8()?
    before

  fun ref line(keep_line_breaks: Bool = false): String iso^ ? =>
    """
    Consume and return the next line, terminated by \n or \r\n. The line
    break is removed from the buffer and, unless `keep_line_breaks` is `true`,
    is also stripped from the returned string. Raises an error if no \n is
    pending.
    """
    let total = _search_length()?

    _available = _available - total
    var text = recover String(total) end
    var gathered = USize(0)

    while gathered < total do
      let first = _chunks.head()?
      (let bytes, let start) = first()?

      let in_chunk = bytes.size() - start
      let missing = total - gathered
      let take = missing.min(in_chunk)

      text.append(bytes, start, take)

      if in_chunk > missing then
        // The line ends inside this chunk: keep the rest for later reads.
        first()? = (bytes, start + missing)
        break
      end

      gathered = gathered + take
      _chunks.shift()?
    end

    // Number of trailing line-break bytes to strip: none, "\r\n", or "\n".
    let terminator_len: USize =
      if keep_line_breaks then
        0
      elseif (total >= 2) and (text.at_offset(-2)? == '\r') then
        2
      else
        1
      end
    text.truncate(total - terminator_len)

    consume text

  fun ref u8(): U8 ? =>
    """
    Consume and return the next byte. Raises an error if no data is
    available.
    """
    if _available < 1 then
      error
    end

    _byte()?

  fun ref i8(): I8 ? =>
    """
    Consume the next byte and return it converted to an I8.
    """
    let raw = u8()?
    raw.i8()

  fun ref u16_be(): U16 ? =>
    """
    Consume a big-endian U16. Raises an error if fewer bytes than the width
    of a U16 are available.
    """
    let width = U16(0).bytewidth()
    if _available < width then
      error
    end

    let first = _chunks.head()?
    (let bytes, let pos) = first()?

    if (bytes.size() - pos) < width then
      // The value straddles a chunk boundary: assemble it byte by byte,
      // most significant byte first.
      let hi = u8()?.u16()
      let lo = u8()?.u16()
      return (hi << 8) or lo
    end

    // Fast path: the head chunk holds the whole value.
    let raw = bytes.read_u16(pos)?
    let value =
      ifdef bigendian then
        raw
      else
        raw.bswap()
      end

    let next_pos = pos + width
    _available = _available - width

    if next_pos < bytes.size() then
      first()? = (bytes, next_pos)
    else
      _chunks.shift()?
    end
    value

  fun ref u16_le(): U16 ? =>
    """
    Consume a little-endian U16. Raises an error if fewer bytes than the
    width of a U16 are available.
    """
    let width = U16(0).bytewidth()
    if _available < width then
      error
    end

    let first = _chunks.head()?
    (let bytes, let pos) = first()?

    if (bytes.size() - pos) < width then
      // The value straddles a chunk boundary: assemble it byte by byte,
      // least significant byte first.
      let lo = u8()?.u16()
      let hi = u8()?.u16()
      return lo or (hi << 8)
    end

    // Fast path: the head chunk holds the whole value.
    let raw = bytes.read_u16(pos)?
    let value =
      ifdef littleendian then
        raw
      else
        raw.bswap()
      end

    let next_pos = pos + width
    _available = _available - width

    if next_pos < bytes.size() then
      first()? = (bytes, next_pos)
    else
      _chunks.shift()?
    end
    value

  fun ref i16_be(): I16 ? =>
    """
    Consume a big-endian U16 and return it converted to an I16.
    """
    let raw = u16_be()?
    raw.i16()

  fun ref i16_le(): I16 ? =>
    """
    Consume a little-endian U16 and return it converted to an I16.
    """
    let raw = u16_le()?
    raw.i16()

  fun ref u32_be(): U32 ? =>
    """
    Consume a big-endian U32. Raises an error if fewer bytes than the width
    of a U32 are available.
    """
    let width = U32(0).bytewidth()
    if _available < width then
      error
    end

    let first = _chunks.head()?
    (let bytes, let pos) = first()?

    if (bytes.size() - pos) < width then
      // The value straddles a chunk boundary: fold the bytes in one at a
      // time, most significant byte first.
      var acc: U32 = 0
      var remaining = width
      while remaining > 0 do
        acc = (acc << 8) or u8()?.u32()
        remaining = remaining - 1
      end
      return acc
    end

    // Fast path: the head chunk holds the whole value.
    let raw = bytes.read_u32(pos)?
    let value =
      ifdef bigendian then
        raw
      else
        raw.bswap()
      end

    let next_pos = pos + width
    _available = _available - width

    if next_pos < bytes.size() then
      first()? = (bytes, next_pos)
    else
      _chunks.shift()?
    end
    value

  fun ref u32_le(): U32 ? =>
    """
    Consume a little-endian U32. Raises an error if fewer bytes than the
    width of a U32 are available.
    """
    let width = U32(0).bytewidth()
    if _available < width then
      error
    end

    let first = _chunks.head()?
    (let bytes, let pos) = first()?

    if (bytes.size() - pos) < width then
      // The value straddles a chunk boundary: place each byte at its shift,
      // least significant byte first.
      var acc: U32 = 0
      var shift: U32 = 0
      var remaining = width
      while remaining > 0 do
        acc = acc or (u8()?.u32() << shift)
        shift = shift + 8
        remaining = remaining - 1
      end
      return acc
    end

    // Fast path: the head chunk holds the whole value.
    let raw = bytes.read_u32(pos)?
    let value =
      ifdef littleendian then
        raw
      else
        raw.bswap()
      end

    let next_pos = pos + width
    _available = _available - width

    if next_pos < bytes.size() then
      first()? = (bytes, next_pos)
    else
      _chunks.shift()?
    end
    value

  fun ref i32_be(): I32 ? =>
    """
    Consume a big-endian U32 and return it converted to an I32.
    """
    let raw = u32_be()?
    raw.i32()

  fun ref i32_le(): I32 ? =>
    """
    Consume a little-endian U32 and return it converted to an I32.
    """
    let raw = u32_le()?
    raw.i32()

  fun ref u64_be(): U64 ? =>
    """
    Consume a big-endian U64. Raises an error if fewer bytes than the width
    of a U64 are available.
    """
    let width = U64(0).bytewidth()
    if _available < width then
      error
    end

    let first = _chunks.head()?
    (let bytes, let pos) = first()?

    if (bytes.size() - pos) < width then
      // The value straddles a chunk boundary: fold the bytes in one at a
      // time, most significant byte first.
      var acc: U64 = 0
      var remaining = width
      while remaining > 0 do
        acc = (acc << 8) or u8()?.u64()
        remaining = remaining - 1
      end
      return acc
    end

    // Fast path: the head chunk holds the whole value.
    let raw = bytes.read_u64(pos)?
    let value =
      ifdef bigendian then
        raw
      else
        raw.bswap()
      end

    let next_pos = pos + width
    _available = _available - width

    if next_pos < bytes.size() then
      first()? = (bytes, next_pos)
    else
      _chunks.shift()?
    end
    value

  fun ref u64_le(): U64 ? =>
    """
    Consume a little-endian U64. Raises an error if fewer bytes than the
    width of a U64 are available.
    """
    let width = U64(0).bytewidth()
    if _available < width then
      error
    end

    let first = _chunks.head()?
    (let bytes, let pos) = first()?

    if (bytes.size() - pos) < width then
      // The value straddles a chunk boundary: place each byte at its shift,
      // least significant byte first.
      var acc: U64 = 0
      var shift: U64 = 0
      var remaining = width
      while remaining > 0 do
        acc = acc or (u8()?.u64() << shift)
        shift = shift + 8
        remaining = remaining - 1
      end
      return acc
    end

    // Fast path: the head chunk holds the whole value.
    let raw = bytes.read_u64(pos)?
    let value =
      ifdef littleendian then
        raw
      else
        raw.bswap()
      end

    let next_pos = pos + width
    _available = _available - width

    if next_pos < bytes.size() then
      first()? = (bytes, next_pos)
    else
      _chunks.shift()?
    end
    value

  fun ref i64_be(): I64 ? =>
    """
    Consume a big-endian U64 and return it converted to an I64.
    """
    let raw = u64_be()?
    raw.i64()

  fun ref i64_le(): I64 ? =>
    """
    Consume a little-endian U64 and return it converted to an I64.
    """
    let raw = u64_le()?
    raw.i64()

  fun ref u128_be(): U128 ? =>
    """
    Consume a big-endian U128. Raises an error if fewer bytes than the width
    of a U128 are available.
    """
    let width = U128(0).bytewidth()
    if _available < width then
      error
    end

    let first = _chunks.head()?
    (let bytes, let pos) = first()?

    if (bytes.size() - pos) < width then
      // The value straddles a chunk boundary: fold the bytes in one at a
      // time, most significant byte first.
      var acc: U128 = 0
      var remaining = width
      while remaining > 0 do
        acc = (acc << 8) or u8()?.u128()
        remaining = remaining - 1
      end
      return acc
    end

    // Fast path: the head chunk holds the whole value.
    let raw = bytes.read_u128(pos)?
    let value =
      ifdef bigendian then
        raw
      else
        raw.bswap()
      end

    let next_pos = pos + width
    _available = _available - width

    if next_pos < bytes.size() then
      first()? = (bytes, next_pos)
    else
      _chunks.shift()?
    end
    value

  fun ref u128_le(): U128 ? =>
    """
    Consume a little-endian U128. Raises an error if fewer bytes than the
    width of a U128 are available.
    """
    let width = U128(0).bytewidth()
    if _available < width then
      error
    end

    let first = _chunks.head()?
    (let bytes, let pos) = first()?

    if (bytes.size() - pos) < width then
      // The value straddles a chunk boundary: place each byte at its shift,
      // least significant byte first.
      var acc: U128 = 0
      var shift: U128 = 0
      var remaining = width
      while remaining > 0 do
        acc = acc or (u8()?.u128() << shift)
        shift = shift + 8
        remaining = remaining - 1
      end
      return acc
    end

    // Fast path: the head chunk holds the whole value.
    let raw = bytes.read_u128(pos)?
    let value =
      ifdef littleendian then
        raw
      else
        raw.bswap()
      end

    let next_pos = pos + width
    _available = _available - width

    if next_pos < bytes.size() then
      first()? = (bytes, next_pos)
    else
      _chunks.shift()?
    end
    value

  fun ref i128_be(): I128 ? =>
    """
    Consume a big-endian U128 and return it converted to an I128.
    """
    let raw = u128_be()?
    raw.i128()

  fun ref i128_le(): I128 ? =>
    """
    Consume a little-endian U128 and return it converted to an I128.
    """
    let raw = u128_le()?
    raw.i128()

  fun ref f32_be(): F32 ? =>
    """
    Consume a big-endian U32 and build an F32 from its bits.
    """
    let bits = u32_be()?
    F32.from_bits(bits)

  fun ref f32_le(): F32 ? =>
    """
    Consume a little-endian U32 and build an F32 from its bits.
    """
    let bits = u32_le()?
    F32.from_bits(bits)

  fun ref f64_be(): F64 ? =>
    """
    Consume a big-endian U64 and build an F64 from its bits.
    """
    let bits = u64_be()?
    F64.from_bits(bits)

  fun ref f64_le(): F64 ? =>
    """
    Consume a little-endian U64 and build an F64 from its bits.
    """
    let bits = u64_le()?
    F64.from_bits(bits)

  fun ref _byte(): U8 ? =>
    """
    Consume one byte from the head chunk. Does not check `_available`
    itself; raises an error if there is no head chunk or the byte cannot be
    read from it.
    """
    let first = _chunks.head()?
    (let bytes, let pos) = first()?
    let value = bytes(pos)?
    let next_pos = pos + 1

    _available = _available - 1

    if next_pos >= bytes.size() then
      // The head chunk is exhausted: drop it.
      _chunks.shift()?
    else
      first()? = (bytes, next_pos)
    end
    value

  fun box peek_u8(offset: USize = 0): U8 ? =>
    """
    Return the U8 located `offset` bytes past the read cursor without
    consuming it. Raises an error if that byte is not available.
    """
    let value = _peek_byte(offset)?
    value

  fun box peek_i8(offset: USize = 0): I8 ? =>
    """
    Peek at the byte at `offset` and return it converted to an I8.
    """
    let raw = peek_u8(offset)?
    raw.i8()

  fun box peek_u16_be(offset: USize = 0): U16 ? =>
    """
    Peek at a big-endian U16 starting at `offset`, without consuming data.
    """
    let hi = peek_u8(offset)?.u16()
    let lo = peek_u8(offset + 1)?.u16()
    (hi << 8) or lo

  fun box peek_u16_le(offset: USize = 0): U16 ? =>
    """
    Peek at a little-endian U16 starting at `offset`, without consuming data.
    """
    let lo = peek_u8(offset)?.u16()
    let hi = peek_u8(offset + 1)?.u16()
    lo or (hi << 8)

  fun box peek_i16_be(offset: USize = 0): I16 ? =>
    """
    Peek at a big-endian U16 at `offset` and return it converted to an I16.
    """
    let raw = peek_u16_be(offset)?
    raw.i16()

  fun box peek_i16_le(offset: USize = 0): I16 ? =>
    """
    Peek at a little-endian U16 at `offset` and return it converted to an I16.
    """
    let raw = peek_u16_le(offset)?
    raw.i16()

  fun box peek_u32_be(offset: USize = 0): U32 ? =>
    """
    Peek at a big-endian U32 starting at `offset`, built from two big-endian
    U16 halves.
    """
    let hi = peek_u16_be(offset)?.u32()
    let lo = peek_u16_be(offset + 2)?.u32()
    (hi << 16) or lo

  fun box peek_u32_le(offset: USize = 0): U32 ? =>
    """
    Peek at a little-endian U32 starting at `offset`, built from two
    little-endian U16 halves.
    """
    let lo = peek_u16_le(offset)?.u32()
    let hi = peek_u16_le(offset + 2)?.u32()
    lo or (hi << 16)

  fun box peek_i32_be(offset: USize = 0): I32 ? =>
    """
    Peek at a big-endian U32 at `offset` and return it converted to an I32.
    """
    let raw = peek_u32_be(offset)?
    raw.i32()

  fun box peek_i32_le(offset: USize = 0): I32 ? =>
    """
    Peek at a little-endian U32 at `offset` and return it converted to an I32.
    """
    let raw = peek_u32_le(offset)?
    raw.i32()

  fun box peek_u64_be(offset: USize = 0): U64 ? =>
    """
    Peek at a big-endian U64 starting at `offset`, built from two big-endian
    U32 halves.
    """
    let hi = peek_u32_be(offset)?.u64()
    let lo = peek_u32_be(offset + 4)?.u64()
    (hi << 32) or lo

  fun box peek_u64_le(offset: USize = 0): U64 ? =>
    """
    Peek at a little-endian U64 starting at `offset`, built from two
    little-endian U32 halves.
    """
    let lo = peek_u32_le(offset)?.u64()
    let hi = peek_u32_le(offset + 4)?.u64()
    lo or (hi << 32)

  fun box peek_i64_be(offset: USize = 0): I64 ? =>
    """
    Peek at a big-endian U64 at `offset` and return it converted to an I64.
    """
    let raw = peek_u64_be(offset)?
    raw.i64()

  fun box peek_i64_le(offset: USize = 0): I64 ? =>
    """
    Peek at a little-endian U64 at `offset` and return it converted to an I64.
    """
    let raw = peek_u64_le(offset)?
    raw.i64()

  fun box peek_u128_be(offset: USize = 0): U128 ? =>
    """
    Peek at a big-endian U128 starting at `offset`, built from two big-endian
    U64 halves.
    """
    let hi = peek_u64_be(offset)?.u128()
    let lo = peek_u64_be(offset + 8)?.u128()
    (hi << 64) or lo

  fun box peek_u128_le(offset: USize = 0): U128 ? =>
    """
    Peek at a little-endian U128 starting at `offset`, built from two
    little-endian U64 halves.
    """
    let lo = peek_u64_le(offset)?.u128()
    let hi = peek_u64_le(offset + 8)?.u128()
    lo or (hi << 64)

  fun box peek_i128_be(offset: USize = 0): I128 ? =>
    """
    Peek at a big-endian U128 at `offset` and return it converted to an I128.
    """
    let raw = peek_u128_be(offset)?
    raw.i128()

  fun box peek_i128_le(offset: USize = 0): I128 ? =>
    """
    Peek at a little-endian U128 at `offset` and return it converted to an
    I128.
    """
    let raw = peek_u128_le(offset)?
    raw.i128()

  fun box peek_f32_be(offset: USize = 0): F32 ? =>
    """
    Peek at a big-endian U32 at `offset` and build an F32 from its bits.
    """
    let bits = peek_u32_be(offset)?
    F32.from_bits(bits)

  fun box peek_f32_le(offset: USize = 0): F32 ? =>
    """
    Peek at a little-endian U32 at `offset` and build an F32 from its bits.
    """
    let bits = peek_u32_le(offset)?
    F32.from_bits(bits)

  fun box peek_f64_be(offset: USize = 0): F64 ? =>
    """
    Peek at a big-endian U64 at `offset` and build an F64 from its bits.
    """
    let bits = peek_u64_be(offset)?
    F64.from_bits(bits)

  fun box peek_f64_le(offset: USize = 0): F64 ? =>
    """
    Peek at a little-endian U64 at `offset` and build an F64 from its bits.
    """
    let bits = peek_u64_le(offset)?
    F64.from_bits(bits)

  fun box _peek_byte(offset: USize = 0): U8 ? =>
    """
    Return the byte `offset` positions past the read cursor, leaving the
    cursor where it is. Raises an error if the buffered chunks do not reach
    that far.
    """
    var remaining = offset

    for node in _chunks.nodes() do
      (let bytes, let start) = node()?
      // Account for the part of this chunk that has already been consumed.
      remaining = remaining + start

      let chunk_len = bytes.size()
      if remaining < chunk_len then
        return bytes(remaining)?
      end

      // The target lies beyond this chunk: step over it.
      remaining = remaining - chunk_len
    end

    error

  fun ref _distance_of(byte: U8): USize ? =>
    """
    Return the number of unread bytes up to and including the first
    occurrence of `byte`. Raises an error if the buffer is empty or no
    pending chunk contains the byte.
    """
    if _chunks.size() == 0 then
      error
    end

    var current = _chunks.head()?
    var skipped: USize = 0

    while true do
      (let bytes, let start) = current()?

      try
        // `find` yields an index into this chunk; convert it to a distance
        // from the read cursor, counting the matched byte itself.
        let hit = bytes.find(byte, start)?
        return (skipped + hit + 1) - start
      end

      // No match here: count this chunk's unread bytes and move on.
      skipped = skipped + (bytes.size() - start)

      if not current.has_next() then
        break
      end

      current = current.next() as ListNode[(Array[U8] val, USize)]
    end

    error

  fun ref _search_length(): USize ? =>
    """
    Return the length of the pending line, measured up to and including its
    terminating \n. Raises an error if no \n is buffered.
    """
    let newline: U8 = '\n'
    _distance_of(newline)?