Added saving/loading via zon, fixed parsing bugs.

This commit is contained in:
2026-05-29 08:45:21 -05:00
parent 4545aa5da0
commit 911a6483a6
5 changed files with 1132292 additions and 75 deletions
+20 -2
View File
@@ -3,9 +3,27 @@ An implementation of a Markov Chain random post generator in Zig.
## Usage
```zigkov posts.txt```
File contents need to be organized into sentences ending with a delimiter (set with `--deliminator`; the default is `.`). The weight function affects the chances of ending the selection of the word group.
The text file is a flat database of text, with each post separated by ```~```.
```
Markov Chains in Zig (https://occultusterra.com/WWelna/zigkov)
Copyright (C) 2026 William Welna (wwelna@occultusterra.com)
-h, --help
Display this help and exit.
-p, --process <str>
Read text file & build markov chains, saves as `filename`.zon.
-d, --deliminator <str>
Deliminator to split the strings into, default of `.`.
-w, --weight <str>
Weight of the random() function adjustment, default 0.01.
-m, --markov <str>
Read/use previously saved markov chain.
```
## The No Vibe Coders Open Source License
+79822
View File
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+1008538
View File
File diff suppressed because it is too large Load Diff
+163 -73
View File
@@ -42,10 +42,30 @@
const std = @import("std");
const clap = @import("clap");
const zon = std.zon;
const Word = struct {
// For Generating zon
const processWord = struct {
word: std.ArrayList(u8),
next: std.ArrayList(WordStat),
next: std.ArrayList(processWordStat),
hash: u64,
end_count: f64,
end_normalized: f64,
start_count: f64,
start_normalized: f64,
};
/// One follower entry stored in a `processWord.next` list while the chain is being built.
/// NOTE(review): the code that updates `count` and fills in `normalized` is not visible
/// in this view — confirm the details against `add` / `do_stats`.
const processWordStat = struct {
/// Bytes of the follower word, held in a growable list (presumably the cleaned `next`
/// argument of `add` — TODO confirm).
word: std.ArrayList(u8),
/// `fnv()` hash of the word's bytes, stored when the entry is created.
hash: u64,
/// Occurrence counter; a newly created entry starts at 1.
count: f64,
/// Starts at 0 when the entry is created; presumably a probability derived from
/// `count` — TODO confirm where it is computed.
normalized: f64,
};
// For reading zon and generating posts: uses slices instead of ArrayLists
const Word = struct {
word: []const u8,
next: []const WordStat,
hash: u64,
end_count: f64,
end_normalized: f64,
@@ -54,15 +74,14 @@ const Word = struct {
};
const WordStat = struct {
word: std.ArrayList(u8),
word: []const u8,
hash: u64,
count: f64,
normalized: f64,
};
var MarkovChain: std.ArrayList(Word) = undefined;
pub fn fnv(word: []const u8) u64 {
// Hash function for quick searches
fn fnv(word: []const u8) u64 {
var hash: u64 = 0xcbf29ce484222325;
for (word) |byte| {
hash *%= 0x100000001b3;
@@ -71,16 +90,15 @@ pub fn fnv(word: []const u8) u64 {
return hash;
}
pub inline fn clean(allocator: std.mem.Allocator, word: []const u8, output: *std.ArrayList(u8)) !void {
//const filter:[]u8 = "$@\\€-_";
for (word) |x| { // pretty
/// Normalizes `word` and appends the result to `output`.
///
/// The one-letter word "I" is appended unchanged. For any other input only
/// ASCII letters are kept (lower-cased); every other byte is dropped.
/// Fails only if growing `output` with `allocator` fails.
inline fn clean(allocator: std.mem.Allocator, word: []const u8, output: *std.ArrayList(u8)) !void {
    // Keep the pronoun "I" capitalised.
    if (word.len == 1 and word[0] == 'I') {
        try output.append(allocator, word[0]);
        return;
    }

    for (word) |byte| {
        if (!std.ascii.isAlphabetic(byte)) continue;
        try output.append(allocator, std.ascii.toLower(byte));
    }
}
pub inline fn add(allocator: std.mem.Allocator, chains: *std.ArrayList(Word), word: []const u8, next: []const u8, is_start: bool) !void {
inline fn add(allocator: std.mem.Allocator, chains: *std.ArrayList(processWord), word: []const u8, next: []const u8, is_start: bool) !void {
var updated: bool = false;
var w: std.ArrayList(u8) = .empty;
var n: std.ArrayList(u8) = .empty;
@@ -110,13 +128,13 @@ pub inline fn add(allocator: std.mem.Allocator, chains: *std.ArrayList(Word), wo
}
if (updated == false) { // No point setting this as not used again
// Add new entry word/next pair
var entry: std.ArrayList(WordStat) = .empty;
var entry: std.ArrayList(processWordStat) = .empty;
try entry.append(allocator, .{ .word = n, .hash = fnv(n.items), .count = 1, .normalized = 0 });
try chains.append(allocator, .{ .word = w, .next = entry, .hash = fnv(w.items), .end_count = 0, .end_normalized = 0, .start_count = if (is_start) 1 else 0, .start_normalized = 0 });
}
}
pub inline fn add_end(allocator: std.mem.Allocator, chains: *std.ArrayList(Word), word: []const u8) !void {
inline fn add_end(allocator: std.mem.Allocator, chains: *std.ArrayList(processWord), word: []const u8) !void {
var updated: bool = false;
var w: std.ArrayList(u8) = .empty;
try clean(allocator, word, &w);
@@ -135,9 +153,9 @@ pub inline fn add_end(allocator: std.mem.Allocator, chains: *std.ArrayList(Word)
}
}
pub inline fn do_stats(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chains: *std.ArrayList(Word), data: []const u8) !void {
inline fn do_stats(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chains: *std.ArrayList(processWord), fileName: []const u8, deliminator: u8) !void {
var buffer: [4096]u8 = undefined;
var file = try std.Io.Dir.cwd().openFile(io, data, .{ .mode = .read_only });
var file = try std.Io.Dir.cwd().openFile(io, fileName, .{ .mode = .read_only });
defer file.close(io);
_ = rand;
@@ -147,17 +165,19 @@ pub inline fn do_stats(io: std.Io, allocator: std.mem.Allocator, rand: std.Rando
const buf2 = try allocator.alloc(u8, 4096);
var post_count: f64 = 0;
while (reader.takeDelimiterInclusive('~')) |line| {
while (reader.takeDelimiterInclusive(deliminator)) |line| {
post_count += 1;
const tmp = buf2[0..std.mem.replacementSize(u8, line, "\n", "")];
_ = std.mem.replace(u8, line, "\n", "", tmp);
const tmp = buf2[0..std.mem.replacementSize(u8, line, "\n", " ")];
_ = std.mem.replace(u8, line, "\n", " ", tmp);
var splits = std.mem.splitAny(u8, tmp, " ");
var is_start: bool = true;
while (splits.next()) |x| {
if (splits.peek()) |y| {
try add(allocator, chains, x, y, is_start);
} else try add_end(allocator, chains, x);
is_start = false;
if (x.len >= 1) {
if (splits.peek()) |y| {
try add(allocator, chains, x, y, is_start);
} else try add_end(allocator, chains, x);
is_start = false;
}
}
} else |err| switch (err) {
error.EndOfStream => {},
@@ -176,27 +196,23 @@ pub inline fn do_stats(io: std.Io, allocator: std.mem.Allocator, rand: std.Rando
}
}
pub inline fn random(rand: std.Random, probability: f64, weight: f64) bool {
const ret = rand.float(f64);
return ret <= (probability + weight);
}
// Need to fix
// inline fn do_spongebob(rand: std.Random, line: []u8) []u8 {
// for (line) |*x| {
// if (std.ascii.isAlphabetic(x.*) and std.ascii.isLower(x.*)) {
// if (random(rand, 0.5, 0)) {
// x.* = std.ascii.toUpper(x.*);
// }
// }
// }
// return line;
// }
pub inline fn do_spongebob(rand: std.Random, line: []u8) []u8 {
for (line) |*x| {
if (std.ascii.isAlphabetic(x.*) and std.ascii.isLower(x.*)) {
if (random(rand, 0.5, 0)) {
x.* = std.ascii.toUpper(x.*);
}
}
}
return line;
}
pub inline fn find(chains: *std.ArrayList(Word), word: []u8) ?*Word {
for (chains.items) |*x| {
inline fn find(chains: []Word, word: []const u8) ?*Word {
for (chains) |*x| {
const looking: u64 = fnv(word);
if (x.hash == looking) {
if (std.mem.eql(u8, x.word.items, word)) { // confirm match
if (std.mem.eql(u8, x.word, word)) { // confirm match
return x;
}
}
@@ -204,16 +220,22 @@ pub inline fn find(chains: *std.ArrayList(Word), word: []u8) ?*Word {
return null;
}
pub fn do_next(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chains: *std.ArrayList(Word)) !?[]u8 {
/// Draws a uniform float from `rand` and reports whether it falls below
/// `probability + weight`, with that threshold capped at 0.03.
///
/// A threshold of zero or less can never be hit, so the result is then
/// always `false`.
///
/// TODO: make the weight dynamic to the size of the MarkovChain to avoid weirdness.
inline fn random(rand: std.Random, probability: f64, weight: f64) bool {
    const threshold: f64 = @min(0.03, probability + weight);
    return rand.float(f64) < threshold;
}
fn do_next(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chains: []Word, weight: f64) !?[]u8 {
var post: std.ArrayList(u8) = .empty;
var starter: *Word = undefined;
_ = io;
while (true) {
const i: usize = std.Random.uintAtMost(rand, usize, chains.items.len - 1);
if (chains.items[i].next.items.len > 5 and random(rand, chains.items[i].start_normalized, 0)) {
starter = &chains.items[i];
try post.appendSlice(allocator, starter.word.items);
const i: usize = std.Random.uintAtMost(rand, usize, chains.len - 1);
if (chains[i].next.len > 1 and random(rand, chains[i].start_normalized, 0.0)) {
starter = &chains[i];
try post.appendSlice(allocator, starter.word);
try post.append(allocator, ' ');
break;
}
@@ -222,15 +244,15 @@ pub fn do_next(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chain
var still_building: bool = true;
while (still_building) {
var selected: bool = false;
var next: []u8 = undefined;
if (random(rand, starter.end_normalized, 0.0)) break;
var next: []const u8 = undefined;
if (random(rand, starter.end_normalized, weight)) break; // Weight Adjustments need to be done dynamically here
while (!selected) {
for (starter.next.items) |*x| {
if (random(rand, x.normalized, 0.01)) {
try post.appendSlice(allocator, x.word.items);
for (starter.next) |*x| {
if (random(rand, x.normalized, weight)) {
try post.appendSlice(allocator, x.word);
try post.append(allocator, ' ');
selected = true;
next = x.word.items;
next = x.word;
break;
}
}
@@ -238,7 +260,7 @@ pub fn do_next(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chain
if (still_building) {
if (find(chains, next)) |z| { // This should never fail, in theory
starter = z;
if (starter.next.items.len < 1) still_building = false;
if (starter.next.len < 1) still_building = false;
} else still_building = false;
}
}
@@ -246,48 +268,116 @@ pub fn do_next(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chain
return post.items;
}
/// Converts the build-time chain (`processWord` entries backed by ArrayLists)
/// into the slice-based `Word` form.
///
/// Every word buffer is handed over with `toOwnedSlice`, so the word lists in
/// `processMarkovChain` no longer hold their bytes afterwards. Hashes, counts
/// and normalized values are copied unchanged. The returned slice and all
/// nested slices are allocated with `allocator`.
fn processMC(allocator: std.mem.Allocator, processMarkovChain: *std.ArrayList(processWord)) ![]Word {
    const converted = try allocator.alloc(Word, processMarkovChain.items.len);

    for (processMarkovChain.items, converted) |*source, *target| {
        // Follower list first, one slice-based WordStat per ArrayList-based entry.
        const followers = try allocator.alloc(WordStat, source.next.items.len);
        for (source.next.items, followers) |*from, *to| {
            to.* = .{
                .word = try from.word.toOwnedSlice(allocator),
                .hash = from.hash,
                .count = from.count,
                .normalized = from.normalized,
            };
        }

        target.* = .{
            .word = try source.word.toOwnedSlice(allocator),
            .next = followers,
            .hash = source.hash,
            .end_count = source.end_count,
            .end_normalized = source.end_normalized,
            .start_count = source.start_count,
            .start_normalized = source.start_normalized,
        };
    }

    return converted;
}
/// Serializes the Markov chain as ZON into `fileName`, which is created
/// relative to the current working directory.
///
/// Output goes through a 4096-byte buffered writer that is flushed before
/// returning; the file is closed on every exit path.
fn dumpToFile(io: std.Io, MarkovChain: *[]Word, fileName: []const u8) !void {
    var out_file = try std.Io.Dir.cwd().createFile(io, fileName, .{});
    defer out_file.close(io);

    var scratch: [4096]u8 = undefined;
    var buffered = out_file.writer(io, &scratch);

    try zon.stringify.serialize(MarkovChain, .{}, &buffered.interface);
    try buffered.interface.flush();
}
/// Loads a previously saved Markov chain from the ZON file `fileName`
/// (resolved against the current working directory).
///
/// The whole file is read into memory and then copied once more with a
/// trailing 0, because the ZON parser takes a zero-terminated source. Neither
/// buffer is freed here; the existing code relies on the caller passing an
/// arena allocator. No diagnostics object is passed to the parser.
fn readFromFile(io: std.Io, allocator: std.mem.Allocator, fileName: []const u8) ![]Word {
    const raw = try std.Io.Dir.cwd().readFileAlloc(io, fileName, allocator, .unlimited);

    // A little messy: the parser needs a zero-terminated string, so duplicate with a sentinel.
    const source = try allocator.dupeSentinel(u8, raw, 0);

    const chain = try std.zon.parse.fromSliceAlloc([]Word, allocator, source, null, .{ .free_on_error = true });
    return chain;
}
/// Prints the program banner (title line, then copyright line followed by a
/// blank line) via `std.debug.print`.
inline fn printHead() void {
    const title = "Markov Chains in Zig (https://occultusterra.com/WWelna/zigkov)\n";
    const copyright = "Copyright (C) 2026 William Welna (wwelna@occultusterra.com)\n\n";

    std.debug.print(title, .{});
    std.debug.print(copyright, .{});
}
pub fn main(init: std.process.Init) !void {
const allocator = init.arena.allocator();
var processMarkovChain: std.ArrayList(processWord) = .empty;
var MarkovChain: []Word = undefined;
var stdout_buffer: [1024]u8 = undefined;
var stdout_file_writer: std.Io.File.Writer = .init(.stdout(), init.io, &stdout_buffer);
var stdout = &stdout_file_writer.interface;
var randbuff: [8]u8 = undefined;
std.Io.random(init.io, &randbuff);
var prng = std.Random.Isaac64.init(std.mem.readInt(u64, randbuff[0..8], .little));
const rand = prng.random();
const params = comptime clap.parseParamsComptime(
\\-h, --help Display this help and exit.
\\-p, --process <str> Read text file & build markov chains.
\\-o, --out <str> Append output to text file.
\\-m, --markov <str> Read/use previously saved markov chain.
\\-s, --spongebob Do The Sponge Bob!
); // Need to add the JSON Loading/Saving Code (am thinking of using zon)
\\-h, --help Display this help and exit.
\\-p, --process <str> Read text file & build markov chains, saves as `filename`.zon.
\\-d, --deliminator <str> Deliminator to split the strings into, default of `.`.
\\-w, --weight <str> Weight of the random() function adjustment, default 0.01.
\\-m, --markov <str> Read/use previously saved markov chain.
\\
//\\-s, --spongebob Do The Sponge Bob!
);
var diag = clap.Diagnostic{};
var res = clap.parse(clap.Help, &params, clap.parsers.default, init.minimal.args, .{
.diagnostic = &diag,
.allocator = allocator,
}) catch |err| {
try diag.reportToFile(init.io, .stderr(), err);
return err;
}) catch {
printHead();
return clap.helpToFile(init.io, .stderr(), clap.Help, &params, .{});
};
defer res.deinit();
if (res.args.help != 0)
if (res.args.help != 0) {
printHead();
return clap.helpToFile(init.io, .stderr(), clap.Help, &params, .{});
}
if (res.args.process) |f| {
var randbuff: [8]u8 = undefined;
std.Io.random(init.io, &randbuff);
const deliminator: u8 = if (res.args.deliminator) |d| d[0] else '.';
try do_stats(init.io, allocator, rand, &processMarkovChain, f, deliminator);
MarkovChain = try processMC(allocator, &processMarkovChain);
var prng = std.Random.Isaac64.init(std.mem.readInt(u64, randbuff[0..8], .little));
const rand = prng.random();
const stem = std.fs.path.stem(f);
const new_file = try std.fmt.allocPrint(allocator, "{s}{s}", .{ stem, ".zon" });
try dumpToFile(init.io, &MarkovChain, new_file);
return;
} else if (res.args.markov) |f| {
MarkovChain = try readFromFile(init.io, allocator, f);
const weight: f64 = if (res.args.weight) |float_str| try std.fmt.parseFloat(f64, float_str) else 0.01;
MarkovChain = .empty;
try do_stats(init.io, allocator, rand, &MarkovChain, f);
if (res.args.spongebob > 0) {
try stdout.print("{s}\n", .{if (try do_next(init.io, allocator, rand, &MarkovChain)) |s| do_spongebob(rand, s) else ""});
} else {
try stdout.print("{s}\n", .{if (try do_next(init.io, allocator, rand, &MarkovChain)) |s| s else ""});
}
try stdout.print("{s}\n", .{if (try do_next(init.io, allocator, rand, MarkovChain, weight)) |s| s else ""});
try stdout.flush();
return;
}
try stdout.flush();
// Not sure of a good way to do this, so here it is.
printHead();
return clap.helpToFile(init.io, .stderr(), clap.Help, &params, .{});
}