Type-safe conversation format parsing using parser combinators.
Parse multiple conversation formats (Claude, ChatGPT, Mistral, etc.) into a normalized Generic AST, communicating with Elixir via Erlang ports.
- OCaml 4.14+ (5.0+ recommended)
- Angstrom 0.15.0 - Parser combinators
- Atdgen 2.16.0 - Type generation from ATD schemas
- Yojson 3.0.0 - Low-level JSON parsing
- Alberto - Erlang External Term Format for port communication
- Dune 3.0+ - Build system
parser/
├── lib/
│ ├── conversation_types.atd # ATD schema definitions
│ ├── conversation_types.ml # Generated by atdgen
│ ├── conversation_types.mli
│ ├── generic_conversation.ml # Normalized conversation format
│ ├── generic_conversation.mli
│ ├── format_registry.ml # Auto-detection and registry
│ ├── format_registry.mli
│ ├── parsers/
│ │ ├── claude_parser.ml # Claude JSON format
│ │ ├── chatgpt_parser.ml # ChatGPT export format
│ │ ├── mistral_parser.ml # Mistral format
│ │ └── git_log_parser.ml # Git commit history
│ └── port_interface.ml # Alberto-based port communication
├── bin/
│ └── parser_port.ml # Port executable (stdio interface)
├── test/
│ ├── test_claude_parser.ml
│ ├── test_format_registry.ml
│ └── fixtures/ # Sample conversation files
├── dune-project
└── README.md # This file
Each parser implements the ConversationFormat signature:
module type ConversationFormat = sig
type format_specific_ast
val detect : string -> bool
val parse : string -> (Generic.conversation, string) result
val validate : Generic.conversation -> bool
val metadata : format_metadata
end

All format-specific parsers output a normalized Generic.conversation type:
type conversation = {
id : string;
platform : string option;
timestamp : float;
messages : message list;
artifacts : artifact list;
metadata : (string * string) list;
}

The parser runs as a separate OS process, communicating via Erlang External Term Format:
Elixir Port → stdin (ETF) → OCaml Parser → stdout (ETF) → Elixir Port
# Install OPAM (OCaml package manager)
sh <(curl -fsSL https://raw.githubusercontent.com/ocaml/opam/master/shell/install.sh)
# Initialize OPAM
opam init
eval $(opam env)
# Create switch for project
opam switch create anamnesis 5.1.0
eval $(opam env)

opam install dune angstrom atdgen yojson alberto cmdliner alcotest qcheck

# Generate types from ATD
dune build @atdgen
# Build parser library and executable
dune build
# Run tests
dune runtest

# In Elixir
port = Port.open({:spawn, "./parser/_build/default/bin/parser_port.exe"}, [
{:packet, 4},
:binary
])
request = :erlang.term_to_binary(%{
action: :parse,
format: :auto,
content: File.read!("conversation.json")
})
Port.command(port, request)
receive do
{^port, {:data, data}} ->
response = :erlang.binary_to_term(data)
IO.inspect(response.conversation)
end

# utop
#require "anamnesis_parser";;
open Anamnesis_parser;;
let content = In_channel.read_all "test.json" in
match Format_registry.parse_auto content with
| Ok conv -> Printf.printf "Parsed %d messages\n" (List.length conv.messages)
| Error err -> Printf.eprintf "Error: %s\n" err

- Create ATD schema (if format-specific types needed):
(* lib/claude_types.atd *)
type claude_message = {
uuid: string;
text: string;
sender: string;
created_at: string;
}

- Generate types:
dune build @atdgen

- Implement parser:
(* lib/parsers/new_format_parser.ml *)
module NewFormatParser : Format_registry.ConversationFormat = struct
type format_specific_ast = ...
let detect content =
(* Check for format signatures *)
String.contains content "new_format_marker"
let parse content =
(* Use Angstrom combinators *)
match Angstrom.parse_string ~consume:All parser content with
| Ok ast -> Ok (to_generic ast)
| Error msg -> Error msg
let validate conv = true
let metadata = {
name = "NewFormat";
version = "1.0";
description = "New format parser";
}
end

- Register format:
(* lib/format_registry.ml *)
let formats = [
(module ClaudeParser : ConversationFormat);
(module ChatGPTParser : ConversationFormat);
(module NewFormatParser : ConversationFormat); (* Add here *)
]

dune runtest

(* test/test_parser_properties.ml *)
open QCheck
let prop_parse_idempotent =
Test.make ~count:1000
(Gen.string)
(fun content ->
match Format_registry.parse_auto content with
| Error _ -> true (* Invalid input OK *)
| Ok conv1 ->
let json = Generic.to_json conv1 in
match Format_registry.parse_auto json with
| Ok conv2 -> Generic.equal conv1 conv2
| Error _ -> false
)
let () = QCheck_runner.run_tests [prop_parse_idempotent]

- Incremental parsing: Angstrom supports streaming large files
- Zero-copy: Avoid string allocations where possible
- Parallel detection: Format detection can run concurrently
- Target: 100 conversations/sec on modest hardware
See /docs/guides/ocaml-elixir-integration.adoc for detailed integration guide.
Key points:
- Use ports (not NIFs) for fault tolerance
- 4-byte length prefix for message framing
- ETF serialization via Alberto
- Process pool in Elixir (4-8 parser workers)
opam install atdgen

Increase buffer size or implement streaming:
(* Use Angstrom.Buffered for incremental parsing *)

Ensure the Alberto version matches the Elixir OTP version. Test with simple messages first.
- Angstrom Documentation
- ATD/Atdgen Manual
- Alberto (ETF Library)
- Research: /docs/research/ocaml-parsing-elixir-bridge.adoc
- Architecture: /docs/architecture/system-architecture.adoc
- Implement Claude format parser (Milestone 1)
- Create test fixtures from proving-ground
- Add ChatGPT format parser
- Benchmark and optimize
- Add remaining format parsers