Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions tuple/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,6 @@ install(FILES
include/array_tuple_intersection_impl.hpp
include/array_tuple_a_not_b.hpp
include/array_tuple_a_not_b_impl.hpp
include/array_of_strings_sketch.hpp
include/array_of_strings_sketch_impl.hpp
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
164 changes: 164 additions & 0 deletions tuple/include/array_of_strings_sketch.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#ifndef ARRAY_OF_STRINGS_SKETCH_HPP_
#define ARRAY_OF_STRINGS_SKETCH_HPP_

#include <memory>
#include <string>

#include "array_tuple_sketch.hpp"
#include "xxhash64.h"

namespace datasketches {

using array_of_strings = array<std::string>;

// default update policy for an array of strings
class default_array_of_strings_update_policy {
public:
default_array_of_strings_update_policy() = default;

array_of_strings create() const;

void update(array_of_strings& array, const array_of_strings& input) const;

void update(array_of_strings& array, const array_of_strings* input) const;
};

/**
* Serializer/deserializer for an array of strings.
*
* Requirements:
* - Array size must be <= 127.
*
* This serde does not perform UTF-8 validation. Callers must ensure strings
* are valid UTF-8 before serialization to guarantee interoperability with
* Java, Go, and Rust implementations.
*/
template<typename Allocator = std::allocator<array_of_strings>>
struct default_array_of_strings_serde {
using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;

explicit default_array_of_strings_serde(const Allocator& allocator = Allocator());

void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const;
void deserialize(std::istream& is, array_of_strings* items, unsigned num) const;
size_t serialize(void* ptr, size_t capacity, const array_of_strings* items, unsigned num) const;
size_t deserialize(const void* ptr, size_t capacity, array_of_strings* items, unsigned num) const;
size_t size_of_item(const array_of_strings& item) const;

private:
summary_allocator summary_allocator_;
static void check_num_nodes(uint8_t num_nodes);
static uint32_t compute_total_bytes(const array_of_strings& item);
};

/**
* Hashes an array of strings using ArrayOfStrings-compatible hashing.
*/
uint64_t hash_array_of_strings_key(const array_of_strings& key);

/**
* Extended class of compact_tuple_sketch for array of strings.
*
* Requirements:
* - Array size must be <= 127.
*
* UTF-8 compatibility:
* Serialized sketches are intended to be language and platform independent.
* Other implementations (Java, Go, Rust) enforce UTF-8 encoding for strings.
* This C++ implementation does not validate UTF-8; it is the caller's
* responsibility to ensure all strings are valid UTF-8 before calling update().
* Non-UTF-8 strings may serialize successfully but will fail to deserialize
* in other language implementations.
*/
template<typename Allocator = std::allocator<array_of_strings>>
class compact_array_of_strings_tuple_sketch:
public compact_tuple_sketch<array_of_strings, Allocator> {
public:
using Base = compact_tuple_sketch<array_of_strings, Allocator>;
using vector_bytes = typename Base::vector_bytes;
using Base::serialize;

/**
* Copy constructor.
* Constructs a compact sketch from another sketch (update or compact)
* @param other sketch to be constructed from
* @param ordered if true make the resulting sketch ordered
*/
template<typename Sketch>
compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true);

/**
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param seed the seed for the hash function that was used to create the sketch
* @param sd instance of a SerDe
* @param allocator instance of an Allocator
* @return an instance of the sketch
*/
template<typename SerDe = default_array_of_strings_serde<Allocator>>
static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED,
const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());

/**
* This method deserializes a sketch from a given array of bytes.
* @param bytes pointer to the array of bytes
* @param size the size of the array
* @param seed the seed for the hash function that was used to create the sketch
* @param sd instance of a SerDe
* @param allocator instance of an Allocator
* @return an instance of the sketch
*/
template<typename SerDe = default_array_of_strings_serde<Allocator>>
static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED,
const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());

private:
explicit compact_array_of_strings_tuple_sketch(Base&& base);
};

/**
* Convenience alias for update_tuple_sketch for array of strings
*/
template<typename Allocator = std::allocator<array_of_strings>,
typename Policy = default_array_of_strings_update_policy>
using update_array_of_strings_tuple_sketch = update_tuple_sketch<
array_of_strings,
array_of_strings,
Policy,
Allocator
>;

/**
* Converts an array of strings tuple sketch to a compact sketch (ordered or unordered).
* @param sketch input sketch
* @param ordered optional flag to specify if an ordered sketch should be produced
* @return compact array of strings sketch
*/
template<typename Allocator = std::allocator<array_of_strings>, typename Policy = default_array_of_strings_update_policy>
compact_array_of_strings_tuple_sketch<Allocator> compact_array_of_strings_sketch(
const update_array_of_strings_tuple_sketch<Allocator, Policy>& sketch, bool ordered = true);

} /* namespace datasketches */

#include "array_of_strings_sketch_impl.hpp"

#endif
Loading