diff --git a/content/build/guides/meta.json b/content/build/guides/meta.json index 5eb8092fa..1d52d2ebe 100644 --- a/content/build/guides/meta.json +++ b/content/build/guides/meta.json @@ -9,6 +9,7 @@ "crossmint-nft-minting-app", "working-with-arns", "using-turbo-in-a-browser", - "storing-nfts" + "storing-nfts", + "verifiable-ai" ] } diff --git a/content/build/guides/verifiable-ai/immutable-trust-layer.mdx b/content/build/guides/verifiable-ai/immutable-trust-layer.mdx new file mode 100644 index 000000000..652d7c3c1 --- /dev/null +++ b/content/build/guides/verifiable-ai/immutable-trust-layer.mdx @@ -0,0 +1,47 @@ +--- +title: "The Immutable Trust Layer" +description: "Implement a Lambda Architecture for AI logging that creates tamper-proof audit trails for algorithmic liability" +--- + +import { Callout } from "fumadocs-ui/components/callout"; +import { Steps, Step } from "fumadocs-ui/components/steps"; + +## Output and Liability Verification for AI Systems + +Learn how to implement a Lambda Architecture for AI logging that streams encrypted evidence and builds analytics indices for complete algorithmic accountability. + +## Prerequisites + +Before starting, ensure you have: + +- **Node.js** (v18 or higher) +- **TypeScript** knowledge +- **Arweave Wallet (JWK file)** - We recommend [Wander](https://www.wander.app/) +- **Turbo Credits** - Purchase credits to pay for uploads. See [Turbo Credits guide](/build/upload/turbo-credits) +- Completed [The Verifiable Dataset](/build/guides/verifiable-ai/verifiable-dataset) guide +- Completed [The Signed Model Registry](/build/guides/verifiable-ai/signed-model-registry) guide + +## Overview + +This guide covers: + +- Implementing the Speed Layer for real-time evidence streaming +- Building the Batch Layer with Parquet indices for analytics +- Creating tamper-proof audit trails +- Encrypting sensitive AI outputs +- Querying historical AI decisions +- Establishing algorithmic liability frameworks + + +Content for this guide is coming soon. Check back later for the complete walkthrough. + + +## Summary + +By completing this guide series, you've built a complete verifiable AI infrastructure with: + +- **Verifiable Datasets** ensuring input integrity +- **Signed Model Registry** preventing process drift +- **Immutable Trust Layer** providing output accountability + +Your AI systems now have cryptographic proof of their decisions, creating a foundation for enterprise trust and regulatory compliance. diff --git a/content/build/guides/verifiable-ai/index.mdx b/content/build/guides/verifiable-ai/index.mdx new file mode 100644 index 000000000..b0a03421f --- /dev/null +++ b/content/build/guides/verifiable-ai/index.mdx @@ -0,0 +1,52 @@ +--- +title: "Verifiable AI with AR.IO Network" +description: "Build production-grade verifiable AI systems with immutable data provenance, signed model registries, and tamper-proof audit trails" +--- + +import { + Database, + Shield, + FileCheck, + Lock, +} from "lucide-react"; + +## From Black Box to Glass Box: The Verifiable AI Stack + +The challenge with Enterprise AI is not just performance, but **provenance**. Standard cloud storage is mutable, making it difficult to prove exactly which dataset trained a model or what precise state an AI agent was in during a specific incident. + +To solve **Algorithmic Liability**, AI systems require an immutable root of trust. 
+ +AR.IO Network facilitates this by enabling a **"Glass Box"** architecture: + +- **Verifiable Datasets**: Prove the integrity of training data, whether it lives on S3 or directly on Arweave. +- **Signed Model Registries**: Prevent model drift by verifying weights against on-chain proofs before inference starts. +- **The Trust Layer**: A "Lambda Architecture" for logging that streams encrypted evidence for liability (Speed Layer) and builds Parquet indices for analytics (Batch Layer). + +## What You'll Learn + +In this guide series, you will build a production-grade **Verifiable AI Stack** using TypeScript and the Turbo SDK. + + + } + /> + } + /> + } + /> + + +Each guide builds on the last, creating a complete verifiable AI infrastructure by the end of the series. + +Let's get started. diff --git a/content/build/guides/verifiable-ai/meta.json b/content/build/guides/verifiable-ai/meta.json new file mode 100644 index 000000000..28b5e63c9 --- /dev/null +++ b/content/build/guides/verifiable-ai/meta.json @@ -0,0 +1,9 @@ +{ + "title": "Verifiable AI", + "defaultOpen": false, + "pages": [ + "verifiable-dataset", + "signed-model-registry", + "immutable-trust-layer" + ] +} diff --git a/content/build/guides/verifiable-ai/signed-model-registry.mdx b/content/build/guides/verifiable-ai/signed-model-registry.mdx new file mode 100644 index 000000000..9331a6396 --- /dev/null +++ b/content/build/guides/verifiable-ai/signed-model-registry.mdx @@ -0,0 +1,39 @@ +--- +title: "The Signed Model Registry" +description: "Build a registry that prevents model drift by verifying weights against on-chain proofs before inference starts" +--- + +import { Callout } from "fumadocs-ui/components/callout"; +import { Steps, Step } from "fumadocs-ui/components/steps"; + +## Process Verification for AI Models + +Learn how to build a signed model registry that prevents model drift by verifying weights against on-chain proofs before inference starts. + +## Prerequisites + +Before starting, ensure you have: + +- **Node.js** (v18 or higher) +- **TypeScript** knowledge +- **Arweave Wallet (JWK file)** - We recommend [Wander](https://www.wander.app/) +- **Turbo Credits** - Purchase credits to pay for uploads. See [Turbo Credits guide](/build/upload/turbo-credits) +- Completed [The Verifiable Dataset](/build/guides/verifiable-ai/verifiable-dataset) guide + +## Overview + +This guide covers: + +- Creating cryptographic signatures for model weights +- Storing model metadata on Arweave +- Verifying model integrity before inference +- Implementing a model registry service +- Preventing model drift and tampering + + +Content for this guide is coming soon. Check back later for the complete walkthrough. + + +## Next Steps + +After completing this guide, proceed to [The Immutable Trust Layer](/build/guides/verifiable-ai/immutable-trust-layer) to learn how to create tamper-proof audit trails. 
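+
+In the meantime, the core check the registry performs can be summarized in a few lines: hash the weights file you are about to load and compare it against the fingerprint recorded on Arweave. The sketch below is illustrative only; it assumes the registry entry is a JSON record with a `sha256` field, uploaded the same way as the dataset anchor in the previous guide (the record shape and gateway URL are assumptions, not a final API):
+
+```typescript
+import * as crypto from 'crypto';
+import * as fs from 'fs';
+
+// Illustrative sketch: refuse to run inference if local weights drift from the on-chain record
+async function assertModelMatchesRegistry(weightsPath: string, registryTxId: string) {
+  // Fetch the registry record (assumed to be a JSON data item containing a sha256 field)
+  const record = await (await fetch(`https://arweave.net/${registryTxId}`)).json();
+
+  // Stream-hash the local weights file so large checkpoints never need to fit in RAM
+  const hash = crypto.createHash('sha256');
+  for await (const chunk of fs.createReadStream(weightsPath)) {
+    hash.update(chunk);
+  }
+
+  if (hash.digest('hex') !== record.sha256) {
+    throw new Error('Model weights do not match the signed registry entry. Refusing to start inference.');
+  }
+  console.log('✅ Weights verified against on-chain registry record.');
+}
+```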
diff --git a/content/build/guides/verifiable-ai/verifiable-dataset.mdx b/content/build/guides/verifiable-ai/verifiable-dataset.mdx new file mode 100644 index 000000000..ebe387cc1 --- /dev/null +++ b/content/build/guides/verifiable-ai/verifiable-dataset.mdx @@ -0,0 +1,519 @@ +--- +title: "The Verifiable Dataset" +description: "Create tamper-proof datasets with cryptographic proofs, ensuring data integrity from S3 to Arweave for AI training" +--- + +import { Callout } from "fumadocs-ui/components/callout"; +import { Steps, Step } from "fumadocs-ui/components/steps"; +import { Tabs, Tab } from "fumadocs-ui/components/tabs"; + +Training data is the foundation of any AI model. To ensure provenance, you must be able to prove exactly what data was used to train a specific model version. + +We provide two patterns depending on your data size and requirements: + +- **The Holographic Anchor**: Best for massive data (TB/PB) stored on S3. +- **The Native Data Lake**: Best for high-value data (under 1TB) stored directly on Arweave with a Parquet index. + +## Prerequisites + +Before starting, ensure you have: + +- **Node.js** (v18 or higher) +- **Arweave Wallet (JWK file)** - We recommend [Wander](https://www.wander.app/) +- **Turbo Credits** - Purchase credits to pay for uploads. See [Turbo Credits guide](/build/upload/turbo-credits) +- **TypeScript** knowledge + +### Install Dependencies + +```bash +npm install @ardrive/turbo-sdk parquetjs @ar.io/wayfinder-core @ar.io/sdk +npm install --save-dev @types/node +``` + +## Pattern A: The Holographic Anchor (Off-Chain) + +Use this when your dataset is petabyte-scale or must reside in a specific jurisdiction (GDPR). + +With this pattern, we do not upload the actual file. We upload a **cryptographic fingerprint**. + + + +### Generate Cryptographic Proof + +Create a file `anchor-dataset.ts` to generate a SHA-256 hash of your dataset: + +```typescript +import { TurboFactory } from '@ardrive/turbo-sdk'; +import * as fs from 'fs'; +import * as crypto from 'crypto'; + +export async function createHolographicAnchor(filePath: string, s3Url: string) { + // Setup Turbo client + const jwk = JSON.parse(fs.readFileSync('wallet.json', 'utf-8')); + const turbo = TurboFactory.authenticated({ + privateKey: jwk, + token: 'arweave' + }); + + console.log("1. Generating Cryptographic Proof..."); + + // Hash stream (Efficient for large files, low RAM usage) + const hash = crypto.createHash('sha256'); + const fileStream = fs.createReadStream(filePath); + + // Stream the file through the hash + for await (const chunk of fileStream) { + hash.update(chunk); + } + + const fingerprint = hash.digest('hex'); + + // 2. Prepare the Anchor Payload + const anchor = { + type: 'dataset_anchor', + storage: 's3', + url: s3Url, + sha256: fingerprint, // The mathematical truth + size: fs.statSync(filePath).size, + timestamp: Date.now() + }; + + // 3. Upload Metadata Only + const upload = await turbo.uploadFile({ + fileStreamFactory: () => Buffer.from(JSON.stringify(anchor)), + fileSizeFactory: () => Buffer.byteLength(JSON.stringify(anchor)), + dataItemOpts: { + tags: [ + { name: 'Content-Type', value: 'application/json' }, + { name: 'Type', value: 'Dataset-Anchor' } + ] + } + }); + + console.log(`⚓ Holographic Anchor Minted: ar://${upload.id}`); + return upload.id; +} +``` + + +This approach is memory-efficient for large files. The stream hashing means you can verify petabyte-scale datasets without loading them entirely into RAM. 
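+
+As a rough usage sketch (the archive path and S3 URL below are placeholders, not files from this guide):
+
+```typescript
+// Illustrative call: the file is streamed once for hashing; only a small JSON anchor is uploaded
+const anchorId = await createHolographicAnchor(
+  './exports/training-set-2024.tar',      // local copy of the dataset (placeholder path)
+  's3://my-bucket/training-set-2024.tar'  // where the data actually lives (placeholder URL)
+);
+console.log(`Store this ID alongside your model card: ar://${anchorId}`);
+```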
+ + + + +### Addressing data with ArNS + +Instead of hardcoding anchor IDs, use ArNS to create a stable reference that always points to the latest anchor version and maintains a permanent version history. + +```typescript +import { ARIO, ANT } from '@ar.io/sdk'; + +async function associateAnchorWithArNS(anchorId: string, arnsName: string, version: string) { + // 1. Get the ANT contract for your ArNS name + // (Assumes you've already purchased/leased the ArNS name via https://arns.app) + const ario = ARIO.mainnet(); + const records = await ario.getArNSRecord({ name: arnsName }); + + if (!records) { + throw new Error(`ArNS name "${arnsName}" not found. Purchase it at https://arns.app first.`); + } + + // 2. Connect to the ANT contract + const ant = ANT.init({ + processId: records.processId, + signer: jwk, // Your Arweave wallet + }); + + // 3. Set the @ record to point to latest version + await ant.setRecord({ + undername: '@', + transactionId: anchorId, + ttlSeconds: 3600, // 1 hour cache + }); + + // 4. Set a versioned undername to permanently reference this version + await ant.setRecord({ + undername: version, // e.g., 'v1', 'v2', '2024-12' + transactionId: anchorId, + ttlSeconds: 3600, + }); + + console.log(` Latest: ar://${arnsName} → ${anchorId}`); + console.log(` Version: ar://${version}_${arnsName} → ${anchorId}`); + + return anchorId; +} +``` + + + + +### Verify the Dataset + +The anchor transaction ID serves as an immutable proof that: +1. A specific dataset existed at a specific time +2. The dataset had a specific SHA-256 hash +3. The dataset was stored at a specific S3 URL + +Anyone can verify the dataset hasn't changed by re-hashing the S3 file and comparing it to the on-chain fingerprint. + + + + +```typescript +import * as crypto from 'crypto'; +import * as fs from 'fs'; + +// Simple approach - faster to implement but single point of failure +// Query the latest version or a specific version with ArNS +async function verifyDataset(identifier: string, localFilePath: string) { + // 1. Fetch the anchor from Arweave + // Use ArNS name for latest version: ar://dataset-anchor + // Use versioned undername for specific version: ar://v1_dataset-anchor + const anchorData = await fetch(`https://arweave.net/${identifier}`); + const anchor = await anchorData.json(); + + // 2. Hash the local file + const hash = crypto.createHash('sha256'); + const fileStream = fs.createReadStream(localFilePath); + + for await (const chunk of fileStream) { + hash.update(chunk); + } + + const localFingerprint = hash.digest('hex'); + + // 3. Compare + if (localFingerprint === anchor.sha256) { + console.log('✅ Dataset verified! Matches on-chain anchor.'); + return true; + } else { + console.log('❌ Dataset verification failed! File has been modified.'); + return false; + } +} +``` + + + + + +```typescript +import { createWayfinderClient, PreferredWithFallbackRoutingStrategy, FastestPingRoutingStrategy, NetworkGatewaysProvider } from '@ar.io/wayfinder-core'; +import { ARIO } from '@ar.io/sdk'; +import * as crypto from 'crypto'; +import * as fs from 'fs'; + +// Production approach - preferred gateway with network fallback for resilience +// Query the latest version or a specific version with ArNS +async function verifyDataset(identifier: string, localFilePath: string) { + // 1. 
Setup Wayfinder: tries arweave.net first, falls back to top 10 staked gateways + const wayfinder = createWayfinderClient({ + ario: ARIO.mainnet(), + routingStrategy: new PreferredWithFallbackRoutingStrategy({ + preferredGateway: 'https://arweave.net', + fallbackStrategy: new FastestPingRoutingStrategy({ + timeoutMs: 1000, + gatewaysProvider: new NetworkGatewaysProvider({ + ario: ARIO.mainnet(), + sortBy: 'operatorStake', + limit: 10, + }), + }), + }), + }); + + // 2. Fetch the anchor from Arweave via Wayfinder + // Use ArNS name for latest version: ar://dataset-anchor + // Use versioned undername for specific version: ar://v1_dataset-anchor + const anchorData = await wayfinder.request(`ar://${identifier}`); + const anchor = await anchorData.json(); + + // 3. Hash the local file + const hash = crypto.createHash('sha256'); + const fileStream = fs.createReadStream(localFilePath); + + for await (const chunk of fileStream) { + hash.update(chunk); + } + + const localFingerprint = hash.digest('hex'); + + // 4. Compare + if (localFingerprint === anchor.sha256) { + console.log('✅ Dataset verified! Matches on-chain anchor.'); + return true; + } else { + console.log('❌ Dataset verification failed! File has been modified.'); + return false; + } +} +``` + + + + + + + +**Important**: The Holographic Anchor proves a dataset existed with a specific hash, but doesn't make the data itself permanent. For true permanence, use Pattern B. + + +## Pattern B: The Native Data Lake (On-Chain) + +Use this for fine-tuning sets, RAG Knowledge Bases, or benchmarks where you want both the data and its index permanently stored. + +We upload the raw files to Arweave and generate a Parquet Index. This allows training scripts to filter data (e.g., "Give me only train split images") without downloading the entire dataset manifest. + + + +### Upload Files and Build Index + +Create a file `upload-native-lake.ts`: + +```typescript +import { TurboFactory } from '@ardrive/turbo-sdk'; +import * as parquet from 'parquetjs'; +import * as fs from 'fs'; +import * as path from 'path'; + +// Schema: We verify NOT just the ID, but the content metadata too +const schema = new parquet.ParquetSchema({ + filename: { type: 'UTF8' }, + tx_id: { type: 'UTF8' }, // The Arweave Pointer + byte_size: { type: 'INT64' }, + dataset_split: { type: 'UTF8' }, // 'train' vs 'test' + label: { type: 'UTF8' } // e.g. 'pneumonia' +}); + +export async function uploadDatasetWithIndex(baseDir: string) { + const jwk = JSON.parse(fs.readFileSync('wallet.json', 'utf-8')); + const turbo = TurboFactory.authenticated({ + privateKey: jwk, + token: 'arweave' + }); + + const indexRows = []; + const files = fs.readdirSync(baseDir); + + console.log(`🚀 Processing ${files.length} files...`); + + // 1. Upload Files + for (const file of files) { + const filePath = path.join(baseDir, file); + const size = fs.statSync(filePath).size; + + // Example logic to determine label/split from filename - customize for your dataset + const isTrain = file.startsWith('train'); + const label = file.includes('cat') ? 'cat' : 'dog'; + + const upload = await turbo.uploadFile({ + fileStreamFactory: () => fs.createReadStream(filePath), + fileSizeFactory: () => size, + dataItemOpts: { tags: [{ name: 'Content-Type', value: 'image/jpeg' }] } + }); + + // Add to Index (Don't just list it, describe it) + indexRows.push({ + filename: file, + tx_id: upload.id, + byte_size: size, + dataset_split: isTrain ? 'train' : 'test', + label: label + }); + + console.log(` ✓ Uploaded: ${file}`); + } + + // 2. 
Write Parquet Index + const indexFile = 'dataset_manifest.parquet'; + const writer = await parquet.ParquetWriter.openFile(schema, indexFile); + for (const row of indexRows) await writer.appendRow(row); + await writer.close(); + + // 3. Upload the Index + const manifestUpload = await turbo.uploadFile({ + fileStreamFactory: () => fs.createReadStream(indexFile), + fileSizeFactory: () => fs.statSync(indexFile).size, + dataItemOpts: { + tags: [ + { name: 'Type', value: 'Dataset-Parquet-Manifest' }, + { name: 'Content-Type', value: 'application/octet-stream' } + ] + } + }); + + console.log(`\n🎉 Data Lake Created!`); + console.log(`👉 Index ID: ar://${manifestUpload.id}`); + + return manifestUpload.id; +} +``` + + + +### Addressing data with ArNS + +Instead of hardcoding manifest IDs, use ArNS to create a stable reference that always points to the latest dataset version and maintains a permanent version history. + +```typescript +import { ARIO, ANT } from '@ar.io/sdk'; + +async function associateDatasetWithArNS(manifestId: string, arnsName: string, version: string) { + // 1. Get the ANT contract for your ArNS name + // (Assumes you've already purchased/leased the ArNS name via https://arns.app) + const ario = ARIO.mainnet(); + const records = await ario.getArNSRecord({ name: arnsName }); + + if (!records) { + throw new Error(`ArNS name "${arnsName}" not found. Purchase it at https://arns.app first.`); + } + + // 2. Connect to the ANT contract + const ant = ANT.init({ + processId: records.processId, + signer: jwk, // Your Arweave wallet + }); + + // 3. Set the @ record to point to latest version + await ant.setRecord({ + undername: '@', + transactionId: manifestId, + ttlSeconds: 3600, // 1 hour cache + }); + + // 4. Set a versioned undername to permanently reference this version + await ant.setRecord({ + undername: version, // e.g., 'v1', 'v2', '2024-12' + transactionId: manifestId, + ttlSeconds: 3600, + }); + + console.log(` Latest: ar://${arnsName} → ${manifestId}`); + console.log(` Version: ar://${version}_${arnsName} → ${manifestId}`); + + return manifestId; +} +``` + + + + +### Query the Index + +Training scripts can now query the Parquet index to fetch specific subsets: + + + + +```typescript +import * as parquet from 'parquetjs'; + +// Simple approach - faster to implement but single point of failure +// Query the latest version or a specific version with ArNS +async function getTrainingImages(identifier: string) { + // 1. Download the Parquet index + // Use ArNS name for latest version: ar://medical-imaging + // Use versioned undername for specific version: ar://v1_medical-imaging + const indexData = await fetch(`https://arweave.net/${identifier}`); + const buffer = await indexData.arrayBuffer(); + + // 2. 
Query for training split + const reader = await parquet.ParquetReader.openBuffer(Buffer.from(buffer)); + const cursor = reader.getCursor(); + + const trainingImages = []; + let record = null; + + while (record = await cursor.next()) { + if (record.dataset_split === 'train') { + trainingImages.push({ + url: `ar://${record.tx_id}`, + label: record.label, + size: record.byte_size + }); + } + } + + await reader.close(); + return trainingImages; +} +``` + + + + + +```typescript +import * as parquet from 'parquetjs'; +import { createWayfinderClient, PreferredWithFallbackRoutingStrategy, FastestPingRoutingStrategy, NetworkGatewaysProvider } from '@ar.io/wayfinder-core'; +import { ARIO } from '@ar.io/sdk'; + +// Production approach - keeps your training pipeline operational even during gateway outages +// Query the latest version or a specific version with ArNS +async function getTrainingImages(identifier: string) { + // 1. Setup Wayfinder: tries arweave.net first, falls back to top 10 staked gateways + const wayfinder = createWayfinderClient({ + ario: ARIO.mainnet(), + routingStrategy: new PreferredWithFallbackRoutingStrategy({ + preferredGateway: 'https://arweave.net', + fallbackStrategy: new FastestPingRoutingStrategy({ + timeoutMs: 1000, + gatewaysProvider: new NetworkGatewaysProvider({ + ario: ARIO.mainnet(), + sortBy: 'operatorStake', + limit: 10, + }), + }), + }), + }); + + // 2. Download the Parquet index + // Use ArNS name for latest version: ar://medical-imaging + // Use versioned undername for specific version: ar://v1_medical-imaging + const indexData = await wayfinder.request(`ar://${identifier}`); + const buffer = await indexData.arrayBuffer(); + + // 3. Query for training split + const reader = await parquet.ParquetReader.openBuffer(Buffer.from(buffer)); + const cursor = reader.getCursor(); + + const trainingImages = []; + let record = null; + + while (record = await cursor.next()) { + if (record.dataset_split === 'train') { + trainingImages.push({ + url: `ar://${record.tx_id}`, + label: record.label, + size: record.byte_size + }); + } + } + + await reader.close(); + return trainingImages; +} +``` + + + + + +**Performance Tip**: The Parquet format allows efficient columnar queries, meaning you can filter millions of records without loading the entire dataset into memory. + + + + +## Summary + +You now have two patterns for creating verifiable datasets: + +1. **Holographic Anchor**: For massive datasets that must stay on S3, create an immutable cryptographic fingerprint on Arweave. +2. **Native Data Lake**: For smaller, high-value datasets, store both the data and a queryable Parquet index permanently on Arweave. + +Both patterns provide cryptographic proof of exactly what data was used to train your AI models, solving the provenance problem for Enterprise AI. + +## Next Steps + +Now that you have verifiable datasets, proceed to [The Signed Model Registry](/build/guides/verifiable-ai/signed-model-registry) to learn how to prevent model drift by verifying weights against on-chain proofs.
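+
+As a final illustration, here is a minimal sketch of how a training pipeline might consume the index returned by `getTrainingImages` in the Query step above. The output directory and the use of `arweave.net` for raw downloads are assumptions chosen for brevity; in production you would route these fetches through Wayfinder as shown earlier.
+
+```typescript
+import * as fs from 'fs';
+import * as path from 'path';
+
+// Illustrative consumer: download every train-split image listed in the Parquet index
+// (getTrainingImages is the function defined in the Query the Index step)
+async function downloadTrainSplit(identifier: string, outDir: string) {
+  const images = await getTrainingImages(identifier);
+  fs.mkdirSync(outDir, { recursive: true });
+
+  for (const image of images) {
+    const txId = image.url.replace('ar://', ''); // ar://<txId> pointer from the index
+    const res = await fetch(`https://arweave.net/${txId}`);
+    const bytes = Buffer.from(await res.arrayBuffer());
+    fs.writeFileSync(path.join(outDir, `${image.label}-${txId}.jpg`), bytes);
+  }
+
+  console.log(`Fetched ${images.length} training images into ${outDir}`);
+}
+```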