Added saving/loading via zon, fixed parsing bugs.
This commit is contained in:
@@ -3,9 +3,27 @@ An implementation of a Markov Chain random post generator in Zig.
|
||||
|
||||
## Usage
|
||||
|
||||
```zigkov posts.txt```
|
||||
File contents need to be organized into sentences ending with a delimiter; the default is `.` (see the `--deliminator` option). The weight function affects the chances of ending the selection of the word group.
|
||||
|
||||
The text file is a flat database of text, with each post separated by ```~```.
|
||||
```
|
||||
Markov Chains in Zig (https://occultusterra.com/WWelna/zigkov)
|
||||
Copyright (C) 2026 William Welna (wwelna@occultusterra.com)
|
||||
|
||||
-h, --help
|
||||
Display this help and exit.
|
||||
|
||||
-p, --process <str>
|
||||
Read text file & build markov chains, saves as `filename`.zon.
|
||||
|
||||
-d, --deliminator <str>
|
||||
Deliminator to split the strings into, default of `.`.
|
||||
|
||||
-w, --weight <str>
|
||||
Weight of the random() function adjustment, default 0.01.
|
||||
|
||||
-m, --markov <str>
|
||||
Read/use previously saved markov chain.
|
||||
```
|
||||
|
||||
## The No Vibe Coders Open Source License
|
||||
|
||||
|
||||
+79822
File diff suppressed because it is too large
Load Diff
+43749
File diff suppressed because it is too large
Load Diff
+1008538
File diff suppressed because it is too large
Load Diff
+163
-73
@@ -42,10 +42,30 @@
|
||||
|
||||
const std = @import("std");
|
||||
const clap = @import("clap");
|
||||
const zon = std.zon;
|
||||
|
||||
const Word = struct {
|
||||
// For Generating zon
|
||||
const processWord = struct {
|
||||
word: std.ArrayList(u8),
|
||||
next: std.ArrayList(WordStat),
|
||||
next: std.ArrayList(processWordStat),
|
||||
hash: u64,
|
||||
end_count: f64,
|
||||
end_normalized: f64,
|
||||
start_count: f64,
|
||||
start_normalized: f64,
|
||||
};
|
||||
|
||||
const processWordStat = struct {
|
||||
word: std.ArrayList(u8),
|
||||
hash: u64,
|
||||
count: f64,
|
||||
normalized: f64,
|
||||
};
|
||||
|
||||
// For Reading zon and using + Slices instead of ArrayLists
|
||||
const Word = struct {
|
||||
word: []const u8,
|
||||
next: []const WordStat,
|
||||
hash: u64,
|
||||
end_count: f64,
|
||||
end_normalized: f64,
|
||||
@@ -54,15 +74,14 @@ const Word = struct {
|
||||
};
|
||||
|
||||
const WordStat = struct {
|
||||
word: std.ArrayList(u8),
|
||||
word: []const u8,
|
||||
hash: u64,
|
||||
count: f64,
|
||||
normalized: f64,
|
||||
};
|
||||
|
||||
var MarkovChain: std.ArrayList(Word) = undefined;
|
||||
|
||||
pub fn fnv(word: []const u8) u64 {
|
||||
// Hash function for quick searches
|
||||
fn fnv(word: []const u8) u64 {
|
||||
var hash: u64 = 0xcbf29ce484222325;
|
||||
for (word) |byte| {
|
||||
hash *%= 0x100000001b3;
|
||||
@@ -71,16 +90,15 @@ pub fn fnv(word: []const u8) u64 {
|
||||
return hash;
|
||||
}
|
||||
|
||||
pub inline fn clean(allocator: std.mem.Allocator, word: []const u8, output: *std.ArrayList(u8)) !void {
|
||||
//const filter:[]u8 = "$@\\€-_";
|
||||
for (word) |x| { // pretty
|
||||
inline fn clean(allocator: std.mem.Allocator, word: []const u8, output: *std.ArrayList(u8)) !void {
|
||||
if (word.len == 1 and word[0] == 'I') try output.append(allocator, word[0]) else for (word) |x| {
|
||||
if (std.ascii.isAlphabetic(x)) {
|
||||
try output.append(allocator, std.ascii.toLower(x));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub inline fn add(allocator: std.mem.Allocator, chains: *std.ArrayList(Word), word: []const u8, next: []const u8, is_start: bool) !void {
|
||||
inline fn add(allocator: std.mem.Allocator, chains: *std.ArrayList(processWord), word: []const u8, next: []const u8, is_start: bool) !void {
|
||||
var updated: bool = false;
|
||||
var w: std.ArrayList(u8) = .empty;
|
||||
var n: std.ArrayList(u8) = .empty;
|
||||
@@ -110,13 +128,13 @@ pub inline fn add(allocator: std.mem.Allocator, chains: *std.ArrayList(Word), wo
|
||||
}
|
||||
if (updated == false) { // No point setting this as not used again
|
||||
// Add new entry word/next pair
|
||||
var entry: std.ArrayList(WordStat) = .empty;
|
||||
var entry: std.ArrayList(processWordStat) = .empty;
|
||||
try entry.append(allocator, .{ .word = n, .hash = fnv(n.items), .count = 1, .normalized = 0 });
|
||||
try chains.append(allocator, .{ .word = w, .next = entry, .hash = fnv(w.items), .end_count = 0, .end_normalized = 0, .start_count = if (is_start) 1 else 0, .start_normalized = 0 });
|
||||
}
|
||||
}
|
||||
|
||||
pub inline fn add_end(allocator: std.mem.Allocator, chains: *std.ArrayList(Word), word: []const u8) !void {
|
||||
inline fn add_end(allocator: std.mem.Allocator, chains: *std.ArrayList(processWord), word: []const u8) !void {
|
||||
var updated: bool = false;
|
||||
var w: std.ArrayList(u8) = .empty;
|
||||
try clean(allocator, word, &w);
|
||||
@@ -135,9 +153,9 @@ pub inline fn add_end(allocator: std.mem.Allocator, chains: *std.ArrayList(Word)
|
||||
}
|
||||
}
|
||||
|
||||
pub inline fn do_stats(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chains: *std.ArrayList(Word), data: []const u8) !void {
|
||||
inline fn do_stats(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chains: *std.ArrayList(processWord), fileName: []const u8, deliminator: u8) !void {
|
||||
var buffer: [4096]u8 = undefined;
|
||||
var file = try std.Io.Dir.cwd().openFile(io, data, .{ .mode = .read_only });
|
||||
var file = try std.Io.Dir.cwd().openFile(io, fileName, .{ .mode = .read_only });
|
||||
defer file.close(io);
|
||||
_ = rand;
|
||||
|
||||
@@ -147,17 +165,19 @@ pub inline fn do_stats(io: std.Io, allocator: std.mem.Allocator, rand: std.Rando
|
||||
const buf2 = try allocator.alloc(u8, 4096);
|
||||
|
||||
var post_count: f64 = 0;
|
||||
while (reader.takeDelimiterInclusive('~')) |line| {
|
||||
while (reader.takeDelimiterInclusive(deliminator)) |line| {
|
||||
post_count += 1;
|
||||
const tmp = buf2[0..std.mem.replacementSize(u8, line, "\n", "")];
|
||||
_ = std.mem.replace(u8, line, "\n", "", tmp);
|
||||
const tmp = buf2[0..std.mem.replacementSize(u8, line, "\n", " ")];
|
||||
_ = std.mem.replace(u8, line, "\n", " ", tmp);
|
||||
var splits = std.mem.splitAny(u8, tmp, " ");
|
||||
var is_start: bool = true;
|
||||
while (splits.next()) |x| {
|
||||
if (splits.peek()) |y| {
|
||||
try add(allocator, chains, x, y, is_start);
|
||||
} else try add_end(allocator, chains, x);
|
||||
is_start = false;
|
||||
if (x.len >= 1) {
|
||||
if (splits.peek()) |y| {
|
||||
try add(allocator, chains, x, y, is_start);
|
||||
} else try add_end(allocator, chains, x);
|
||||
is_start = false;
|
||||
}
|
||||
}
|
||||
} else |err| switch (err) {
|
||||
error.EndOfStream => {},
|
||||
@@ -176,27 +196,23 @@ pub inline fn do_stats(io: std.Io, allocator: std.mem.Allocator, rand: std.Rando
|
||||
}
|
||||
}
|
||||
|
||||
pub inline fn random(rand: std.Random, probability: f64, weight: f64) bool {
|
||||
const ret = rand.float(f64);
|
||||
return ret <= (probability + weight);
|
||||
}
|
||||
// Need to fix
|
||||
// inline fn do_spongebob(rand: std.Random, line: []u8) []u8 {
|
||||
// for (line) |*x| {
|
||||
// if (std.ascii.isAlphabetic(x.*) and std.ascii.isLower(x.*)) {
|
||||
// if (random(rand, 0.5, 0)) {
|
||||
// x.* = std.ascii.toUpper(x.*);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// return line;
|
||||
// }
|
||||
|
||||
pub inline fn do_spongebob(rand: std.Random, line: []u8) []u8 {
|
||||
for (line) |*x| {
|
||||
if (std.ascii.isAlphabetic(x.*) and std.ascii.isLower(x.*)) {
|
||||
if (random(rand, 0.5, 0)) {
|
||||
x.* = std.ascii.toUpper(x.*);
|
||||
}
|
||||
}
|
||||
}
|
||||
return line;
|
||||
}
|
||||
|
||||
pub inline fn find(chains: *std.ArrayList(Word), word: []u8) ?*Word {
|
||||
for (chains.items) |*x| {
|
||||
inline fn find(chains: []Word, word: []const u8) ?*Word {
|
||||
for (chains) |*x| {
|
||||
const looking: u64 = fnv(word);
|
||||
if (x.hash == looking) {
|
||||
if (std.mem.eql(u8, x.word.items, word)) { // confirm match
|
||||
if (std.mem.eql(u8, x.word, word)) { // confirm match
|
||||
return x;
|
||||
}
|
||||
}
|
||||
@@ -204,16 +220,22 @@ pub inline fn find(chains: *std.ArrayList(Word), word: []u8) ?*Word {
|
||||
return null;
|
||||
}
|
||||
|
||||
pub fn do_next(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chains: *std.ArrayList(Word)) !?[]u8 {
|
||||
// Need to make the weight dynamic to the size of the MarkovChain to avoid weirdness
|
||||
inline fn random(rand: std.Random, probability: f64, weight: f64) bool {
|
||||
const ret = rand.float(f64) < @min(0.03, probability + weight);
|
||||
return ret;
|
||||
}
|
||||
|
||||
fn do_next(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chains: []Word, weight: f64) !?[]u8 {
|
||||
var post: std.ArrayList(u8) = .empty;
|
||||
var starter: *Word = undefined;
|
||||
_ = io;
|
||||
|
||||
while (true) {
|
||||
const i: usize = std.Random.uintAtMost(rand, usize, chains.items.len - 1);
|
||||
if (chains.items[i].next.items.len > 5 and random(rand, chains.items[i].start_normalized, 0)) {
|
||||
starter = &chains.items[i];
|
||||
try post.appendSlice(allocator, starter.word.items);
|
||||
const i: usize = std.Random.uintAtMost(rand, usize, chains.len - 1);
|
||||
if (chains[i].next.len > 1 and random(rand, chains[i].start_normalized, 0.0)) {
|
||||
starter = &chains[i];
|
||||
try post.appendSlice(allocator, starter.word);
|
||||
try post.append(allocator, ' ');
|
||||
break;
|
||||
}
|
||||
@@ -222,15 +244,15 @@ pub fn do_next(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chain
|
||||
var still_building: bool = true;
|
||||
while (still_building) {
|
||||
var selected: bool = false;
|
||||
var next: []u8 = undefined;
|
||||
if (random(rand, starter.end_normalized, 0.0)) break;
|
||||
var next: []const u8 = undefined;
|
||||
if (random(rand, starter.end_normalized, weight)) break; // Weight Adjustments need to be done dynamically here
|
||||
while (!selected) {
|
||||
for (starter.next.items) |*x| {
|
||||
if (random(rand, x.normalized, 0.01)) {
|
||||
try post.appendSlice(allocator, x.word.items);
|
||||
for (starter.next) |*x| {
|
||||
if (random(rand, x.normalized, weight)) {
|
||||
try post.appendSlice(allocator, x.word);
|
||||
try post.append(allocator, ' ');
|
||||
selected = true;
|
||||
next = x.word.items;
|
||||
next = x.word;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -238,7 +260,7 @@ pub fn do_next(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chain
|
||||
if (still_building) {
|
||||
if (find(chains, next)) |z| { // This should never fail, in theory
|
||||
starter = z;
|
||||
if (starter.next.items.len < 1) still_building = false;
|
||||
if (starter.next.len < 1) still_building = false;
|
||||
} else still_building = false;
|
||||
}
|
||||
}
|
||||
@@ -246,48 +268,116 @@ pub fn do_next(io: std.Io, allocator: std.mem.Allocator, rand: std.Random, chain
|
||||
return post.items;
|
||||
}
|
||||
|
||||
// Converts processMarkovChain -> MarkovChain array with slices instead of ArrayList()s
|
||||
fn processMC(allocator: std.mem.Allocator, processMarkovChain: *std.ArrayList(processWord)) ![]Word {
|
||||
var MarkovChain: std.ArrayList(Word) = .empty;
|
||||
for (processMarkovChain.items) |*entry| {
|
||||
var tmp: std.ArrayList(WordStat) = .empty;
|
||||
for (entry.next.items) |*next| { // To Slices
|
||||
try tmp.append(allocator, .{
|
||||
.word = try next.word.toOwnedSlice(allocator),
|
||||
.hash = next.hash,
|
||||
.count = next.count,
|
||||
.normalized = next.normalized,
|
||||
});
|
||||
}
|
||||
try MarkovChain.append(allocator, .{
|
||||
.word = try entry.word.toOwnedSlice(allocator),
|
||||
.next = try tmp.toOwnedSlice(allocator),
|
||||
.hash = entry.hash,
|
||||
.end_count = entry.end_count,
|
||||
.end_normalized = entry.end_normalized,
|
||||
.start_count = entry.start_count,
|
||||
.start_normalized = entry.start_normalized,
|
||||
});
|
||||
}
|
||||
return try MarkovChain.toOwnedSlice(allocator);
|
||||
}
|
||||
|
||||
fn dumpToFile(io: std.Io, MarkovChain: *[]Word, fileName: []const u8) !void {
|
||||
var buffer: [4096]u8 = undefined;
|
||||
var file = try std.Io.Dir.cwd().createFile(io, fileName, .{});
|
||||
defer file.close(io);
|
||||
|
||||
var file_writer = file.writer(io, &buffer);
|
||||
const writer = &file_writer.interface;
|
||||
|
||||
try zon.stringify.serialize(MarkovChain, .{}, writer);
|
||||
|
||||
try writer.flush();
|
||||
}
|
||||
|
||||
fn readFromFile(io: std.Io, allocator: std.mem.Allocator, fileName: []const u8) ![]Word {
|
||||
const file_contents = try std.Io.Dir.cwd().readFileAlloc(io, fileName, allocator, .unlimited);
|
||||
// A little messy and not ideal since we're using an arena allocator + need a zero terminated string
|
||||
const ret = try std.zon.parse.fromSliceAlloc([]Word, allocator, try allocator.dupeSentinel(u8, file_contents, 0), null, .{ .free_on_error = true });
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline fn printHead() void {
|
||||
std.debug.print("Markov Chains in Zig (https://occultusterra.com/WWelna/zigkov)\n", .{});
|
||||
std.debug.print("Copyright (C) 2026 William Welna (wwelna@occultusterra.com)\n\n", .{});
|
||||
}
|
||||
|
||||
pub fn main(init: std.process.Init) !void {
|
||||
const allocator = init.arena.allocator();
|
||||
|
||||
var processMarkovChain: std.ArrayList(processWord) = .empty;
|
||||
var MarkovChain: []Word = undefined;
|
||||
|
||||
var stdout_buffer: [1024]u8 = undefined;
|
||||
var stdout_file_writer: std.Io.File.Writer = .init(.stdout(), init.io, &stdout_buffer);
|
||||
var stdout = &stdout_file_writer.interface;
|
||||
|
||||
var randbuff: [8]u8 = undefined;
|
||||
std.Io.random(init.io, &randbuff);
|
||||
|
||||
var prng = std.Random.Isaac64.init(std.mem.readInt(u64, randbuff[0..8], .little));
|
||||
const rand = prng.random();
|
||||
|
||||
const params = comptime clap.parseParamsComptime(
|
||||
\\-h, --help Display this help and exit.
|
||||
\\-p, --process <str> Read text file & build markov chains.
|
||||
\\-o, --out <str> Append output to text file.
|
||||
\\-m, --markov <str> Read/use previously saved markov chain.
|
||||
\\-s, --spongebob Do The Sponge Bob!
|
||||
); // Need to add the JSON Loading/Saving Code (am thinking of using zon)
|
||||
\\-h, --help Display this help and exit.
|
||||
\\-p, --process <str> Read text file & build markov chains, saves as `filename`.zon.
|
||||
\\-d, --deliminator <str> Deliminator to split the strings into, default of `.`.
|
||||
\\-w, --weight <str> Weight of the random() function adjustment, default 0.01.
|
||||
\\-m, --markov <str> Read/use previously saved markov chain.
|
||||
\\
|
||||
//\\-s, --spongebob Do The Sponge Bob!
|
||||
);
|
||||
|
||||
var diag = clap.Diagnostic{};
|
||||
var res = clap.parse(clap.Help, ¶ms, clap.parsers.default, init.minimal.args, .{
|
||||
.diagnostic = &diag,
|
||||
.allocator = allocator,
|
||||
}) catch |err| {
|
||||
try diag.reportToFile(init.io, .stderr(), err);
|
||||
return err;
|
||||
}) catch {
|
||||
printHead();
|
||||
return clap.helpToFile(init.io, .stderr(), clap.Help, ¶ms, .{});
|
||||
};
|
||||
defer res.deinit();
|
||||
|
||||
if (res.args.help != 0)
|
||||
if (res.args.help != 0) {
|
||||
printHead();
|
||||
return clap.helpToFile(init.io, .stderr(), clap.Help, ¶ms, .{});
|
||||
}
|
||||
if (res.args.process) |f| {
|
||||
var randbuff: [8]u8 = undefined;
|
||||
std.Io.random(init.io, &randbuff);
|
||||
const deliminator: u8 = if (res.args.deliminator) |d| d[0] else '.';
|
||||
try do_stats(init.io, allocator, rand, &processMarkovChain, f, deliminator);
|
||||
MarkovChain = try processMC(allocator, &processMarkovChain);
|
||||
|
||||
var prng = std.Random.Isaac64.init(std.mem.readInt(u64, randbuff[0..8], .little));
|
||||
const rand = prng.random();
|
||||
const stem = std.fs.path.stem(f);
|
||||
const new_file = try std.fmt.allocPrint(allocator, "{s}{s}", .{ stem, ".zon" });
|
||||
try dumpToFile(init.io, &MarkovChain, new_file);
|
||||
return;
|
||||
} else if (res.args.markov) |f| {
|
||||
MarkovChain = try readFromFile(init.io, allocator, f);
|
||||
const weight: f64 = if (res.args.weight) |float_str| try std.fmt.parseFloat(f64, float_str) else 0.01;
|
||||
|
||||
MarkovChain = .empty;
|
||||
try do_stats(init.io, allocator, rand, &MarkovChain, f);
|
||||
if (res.args.spongebob > 0) {
|
||||
try stdout.print("{s}\n", .{if (try do_next(init.io, allocator, rand, &MarkovChain)) |s| do_spongebob(rand, s) else ""});
|
||||
} else {
|
||||
try stdout.print("{s}\n", .{if (try do_next(init.io, allocator, rand, &MarkovChain)) |s| s else ""});
|
||||
}
|
||||
try stdout.print("{s}\n", .{if (try do_next(init.io, allocator, rand, MarkovChain, weight)) |s| s else ""});
|
||||
try stdout.flush();
|
||||
return;
|
||||
}
|
||||
|
||||
try stdout.flush();
|
||||
// Not sure of a good way to do this, so here it is.
|
||||
printHead();
|
||||
return clap.helpToFile(init.io, .stderr(), clap.Help, ¶ms, .{});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user