-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest.js
More file actions
107 lines (92 loc) · 2.96 KB
/
test.js
File metadata and controls
107 lines (92 loc) · 2.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
// emlet.test.js
const { emlet, Emlet } = require('./emlet')
// Test suite for the emlet embedder: checks output shape for the shared
// instance and custom-sized models, unusual inputs (OOV words, emoji,
// punctuation), chunked text, and similarity ranking.
describe('Emlet', () => {
  // The shared `emlet` instance is expected to produce 96-element vectors.
  it('should return a vector for basic text', () => {
    const embedding = emlet.embed('hello world')
    expect(embedding).to.be.an('array')
    expect(embedding.length).to.equal(96)
    expect(embedding.every(component => typeof component === 'number')).to.be.true
  })

  // The constructor argument is expected to set the output vector length.
  it('should support custom dimensions', () => {
    const wideModel = new Emlet(128)
    const embedding = wideModel.embed('test')
    expect(embedding.length).to.equal(128)
  })

  it('should support smaller dimensions', () => {
    const narrowModel = new Emlet(16)
    const embedding = narrowModel.embed('test')
    expect(embedding.length).to.equal(16)
  })

  // A made-up word must still yield a full-length vector.
  it('should synthesize OOV words', () => {
    const embedding = emlet.embed('quantaflux')
    expect(embedding.length).to.equal(96)
  })

  it('should embed emojis and ZWJ sequences', () => {
    const singleEmoji = emlet.embed('🦄')
    const emojiSequence = emlet.embed('👩🏽🚀')
    expect(singleEmoji.length).to.equal(96)
    expect(emojiSequence.length).to.equal(96)
  })

  it('should embed punctuation as standalone', () => {
    const embedding = emlet.embed('.')
    expect(embedding.length).to.equal(96)
  })

  // Splits a sentence with chunkText (30-character limit) and embeds each piece.
  it('should chunk long text and embed each chunk', () => {
    const pieces = chunkText(
      'This is a long passage that needs to be split into smaller pieces for individual embedding.',
      30
    )
    expect(pieces.length).to.be.above(1)
    const embeddings = pieces.map(piece => emlet.embed(piece))
    expect(embeddings.every(embedding => embedding.length === 96)).to.be.true
  })

  it('should embed long text with rare words', () => {
    const wideModel = new Emlet(128)
    const embedding = wideModel.embed(
      'These long passages with rare words can challenge the affix embedding mechanism.'
    )
    expect(embedding.length).to.equal(128)
  })

  // Only checks that three results come back and that the first outscores the third.
  it('should find top K similar phrases using cosine similarity', () => {
    const candidates = [
      'the sun rises in the east',
      'a cat sits on the mat',
      'early morning light',
      'sunshine over the hills',
      'darkness before dawn'
    ]
    const query = 'morning sunlight'
    const ranked = topKSimilar(query, candidates, 3)
    expect(ranked.length).to.equal(3)
    expect(ranked[0].score).to.be.above(ranked[2].score)
  })
})
/**
 * Splits text into chunks of whole words, each at most `maxLen` characters
 * long where possible.
 *
 * Words are separated on runs of whitespace and re-joined with single spaces.
 * A single word longer than `maxLen` is never split; it becomes a chunk of its
 * own. Empty chunks are never emitted, so empty or whitespace-only input
 * yields an empty array.
 *
 * @param {string} text - The text to split.
 * @param {number} [maxLen=80] - Maximum chunk length in characters.
 * @returns {string[]} The chunks, in original word order.
 */
function chunkText(text, maxLen = 80) {
  // Leading/trailing whitespace makes split() produce empty tokens; drop them
  // so they cannot create blank chunks.
  const words = text.split(/\s+/).filter(word => word !== '')
  const chunks = []
  let current = ''
  for (const word of words) {
    const candidate = current === '' ? word : `${current} ${word}`
    // Only flush when there is something to flush: an oversized first word
    // must not push an empty chunk ahead of itself.
    if (candidate.length > maxLen && current !== '') {
      chunks.push(current)
      current = word
    } else {
      current = candidate
    }
  }
  if (current !== '') chunks.push(current)
  return chunks
}
/**
 * Cosine similarity of two numeric vectors.
 *
 * A small epsilon (1e-8) is added to the denominator, so a zero-length
 * vector gives 0 instead of a division by zero.
 *
 * @param {number[]} a - First vector; its indices drive the dot product.
 * @param {number[]} b - Second vector.
 * @returns {number} Dot product divided by (|a| * |b| + 1e-8).
 */
function cosineSim(a, b) {
  let dot = 0
  let squaredA = 0
  let squaredB = 0
  a.forEach((value, index) => {
    dot += value * b[index]
    squaredA += value * value
  })
  // b is summed separately so its norm covers all of b's own elements.
  b.forEach(value => {
    squaredB += value * value
  })
  return dot / (Math.sqrt(squaredA) * Math.sqrt(squaredB) + 1e-8)
}
/**
 * Ranks candidate texts by cosine similarity to an input text.
 *
 * Both the input and every candidate are embedded with the shared `emlet`
 * instance and compared with `cosineSim`.
 *
 * @param {string} input - The query text.
 * @param {string[]} options - Candidate texts to score.
 * @param {number} [k=5] - Maximum number of results to return.
 * @returns {{text: string, score: number}[]} Up to `k` entries, highest score first.
 */
function topKSimilar(input, options, k = 5) {
  const queryVec = emlet.embed(input)
  const scored = []
  for (const text of options) {
    const candidateVec = emlet.embed(text)
    scored.push({ text, score: cosineSim(queryVec, candidateVec) })
  }
  // Descending by score; `scored` is a local array, so sorting in place is safe.
  scored.sort((left, right) => right.score - left.score)
  return scored.slice(0, k)
}