From b870bbd11aa24c171709528e88e8911012c778dc Mon Sep 17 00:00:00 2001 From: Vladimir Panteleev Date: Thu, 19 Sep 2024 09:39:20 +0000 Subject: [PATCH] Update DustMite Update DustMite, from the upstream commit cb0855d (+ the backported fix) to the upstream commit ab68756. * 6634109 splitter: Add null split mode * 1f40c63 dustmite: Add --reject * bba58a4 dustmite: Add --dump-json * ca93991 dustmite: Fix a placeholder variable name in --help * 4e38836 dustmite: Add --json * e504ccc dustmite: Remove use of newer D features * b173b89 Add support for reading from stdin * bb73f5e tests: Run dustmite in the test's directory * f4d2fa7 Remove the extension stripping for single-file input * 5b5e261 Move JSON loading into splitter module * 1bf5b4b dustmite: Print informational messages to standard error * 61b41ae splitter: Don't replace \ with / on POSIX * 461271f dustmite: Allow omitting the mask portion of --split * faee9d3 splitter: Refactor * fd28eb1 splitter: Fix compilation on 32 bits * 67e676f dustmite: Add --temp-dir * f594d0a dustmite: More explicit File finalization in DiskWriter * 372b4a0 dustmite: Fix stdin on MSVCRT * 4e4a303 README: Update badges * 9696d83 README: Add illustration * 56cdd01 README: Show CI status for master branch * 4da8847 Use skipOver * b8703eb polyhash: Update GDC assembler syntax * fd8c598 fuzz: Cast allAddresses length to float * 62f7899 dustmite: Fix lookback and pingpong strategies reducing to empty set * bc5f5d6 splitter: Print used splitter when loading files * c855596 Refactor Entity file properties into separate structure * 190c180 dustmite: Refactor look-ahead thread handling * 07ba696 dustmite: Handle errors * 71362ea Preserve file attributes * ed78a2e Add support for symbolic links * 675c757 dustmite: Remove explicit Writer destructors * ec5c25d Preserve file timestamps * 7482c9d Preserve directories (incl. empty directories) and their attributes * 4688772 splitter: Add lisp splitter * 5b65ade dustmite: Add some assertions * d0af4a5 splitter: Don't try to reduce module names in D source code * 178f916 splitter: Smarter diff splitting * 5f2ac60 splitter: Add debug check to ensure splitting preserves contents * a093fae splitter: Fix firstHead * 00c2400 splitter: Improve indent splitter * 1c54227 splitter: Refactor parseIndent * ab68756 dustmite: Add --man --- DustMite/dustmite.d | 668 ++++++++++++++++++++++++++++++++----------- DustMite/polyhash.d | 8 +- DustMite/splitter.d | 677 ++++++++++++++++++++++++++++++++++++++------ 3 files changed, 1094 insertions(+), 259 deletions(-) diff --git a/DustMite/dustmite.d b/DustMite/dustmite.d index ec7abf262d..e8876b7f90 100644 --- a/DustMite/dustmite.d +++ b/DustMite/dustmite.d @@ -24,7 +24,7 @@ import std.process; import std.random; import std.range; import std.regex; -import std.stdio; +import std.stdio : stdout, stderr, File; import std.string; import std.typecons; @@ -33,26 +33,33 @@ import splitter; alias Splitter = splitter.Splitter; // Issue 314 workarounds -alias std.string.join join; -alias std.string.startsWith startsWith; +alias join = std.string.join; +alias startsWith = std.algorithm.searching.startsWith; -string dir, resultDir, tester, globalCache; -string dirSuffix(string suffix) { return (dir.absolutePath().buildNormalizedPath() ~ "." ~ suffix).relativePath(); } +string dir, resultDir, tmpDir, tester, globalCache; +string dirSuffix(string suffix, Flag!q{temp} temp) +{ + return ( + (temp && tmpDir ? tmpDir.buildPath(dir.baseName) : dir) + .absolutePath().buildNormalizedPath() ~ "." 
~ suffix + ).relativePath(); +} size_t maxBreadth; size_t origDescendants; int tests, maxSteps = -1; bool foundAnything; bool noSave, trace, noRedirect, doDump, whiteout; +RemoveRule[] rejectRules; string strategy = "inbreadth"; struct Times { StopWatch total, load, testSave, resultSave, apply, lookaheadApply, lookaheadWaitThread, lookaheadWaitProcess, test, clean, globalCache, misc; } Times times; static this() { times.total.start(); times.misc.start(); } -void measure(string what)(scope void delegate() p) +T measure(string what, T)(scope T delegate() p) { times.misc.stop(); mixin("times."~what~".start();"); - p(); - mixin("times."~what~".stop();"); times.misc.start(); + scope(exit) { mixin("times."~what~".stop();"); times.misc.start(); } + return p(); } struct Reduction @@ -140,29 +147,31 @@ struct RemoveRule { Regex!char regexp; string shellGlob; bool remove; } int main(string[] args) { - bool force, dumpHtml, showTimes, stripComments, obfuscate, fuzz, keepLength, showHelp, showVersion, noOptimize, inPlace; + bool force, dumpHtml, dumpJson, readJson, showTimes, stripComments, obfuscate, fuzz, keepLength, showHelp, openWiki, showVersion, noOptimize, inPlace; string coverageDir; RemoveRule[] removeRules; string[] splitRules; uint lookaheadCount, tabWidth = 8; - args = args.filter!( - (arg) - { - if (arg.startsWith("-j")) + args = args + .filter!((string arg) { + if (arg.skipOver("-j")) { - arg = arg[2..$]; lookaheadCount = arg.length ? arg.to!uint : totalCPUs; return false; } return true; - }).array(); + }) + // Work around getopt's inability to handle "-" in 2.080.0 + .map!((string arg) => arg == "-" ? "\0" ~ arg : arg) + .array(); getopt(args, "force", &force, "reduceonly|reduce-only", (string opt, string value) { removeRules ~= RemoveRule(Regex!char.init, value, true); }, "remove" , (string opt, string value) { removeRules ~= RemoveRule(regex(value, "mg"), null, true); }, "noremove|no-remove" , (string opt, string value) { removeRules ~= RemoveRule(regex(value, "mg"), null, false); }, + "reject" , (string opt, string value) { rejectRules ~= RemoveRule(regex(value, "mg"), null, true); }, "strip-comments", &stripComments, "whiteout|white-out", &whiteout, "coverage", &coverageDir, @@ -173,6 +182,7 @@ int main(string[] args) "split", &splitRules, "dump", &doDump, "dump-html", &dumpHtml, + "dump-json", &dumpJson, "times", &showTimes, "noredirect|no-redirect", &noRedirect, "cache", &globalCache, // for research @@ -180,11 +190,16 @@ int main(string[] args) "nosave|no-save", &noSave, // for research "nooptimize|no-optimize", &noOptimize, // for research "tab-width", &tabWidth, + "temp-dir", &tmpDir, "max-steps", &maxSteps, // for research / benchmarking "i|in-place", &inPlace, + "json", &readJson, "h|help", &showHelp, + "man", &openWiki, "V|version", &showVersion, ); + foreach (ref arg; args) + arg.skipOver("\0"); // Undo getopt hack if (showVersion) { @@ -195,7 +210,14 @@ int main(string[] args) enum source = import("source"); else enum source = "upstream"; - writeln("DustMite build ", __DATE__, " (", source, "), built with ", __VENDOR__, " ", __VERSION__); + stdout.writeln("DustMite build ", __DATE__, " (", source, "), built with ", __VENDOR__, " ", __VERSION__); + if (args.length == 1) + return 0; + } + + if (openWiki) + { + browse("https://github.com/CyberShadow/DustMite/wiki"); if (args.length == 1) return 0; } @@ -204,8 +226,7 @@ int main(string[] args) { stderr.writef(q"EOS Usage: %s [OPTION]... PATH TESTER -PATH should be a directory containing a clean copy of the file-set to reduce. 
-A file path can also be specified. NAME.EXT will be treated like NAME/NAME.EXT. +PATH should contain a clean copy of the file-set to reduce. TESTER should be a shell command which returns 0 for a correct reduction, and anything else otherwise. Supported options: @@ -216,6 +237,8 @@ Supported options: (may be used multiple times) --no-remove REGEXP Do not reduce blocks containing REGEXP (may be used multiple times) + --reject REGEXP Reject reductions which cause REGEXP to occur in output + (may be used multiple times) --strip-comments Attempt to remove comments from source code --white-out Replace deleted text with spaces to preserve line numbers --coverage DIR Load .lst files corresponding to source files from DIR @@ -227,7 +250,9 @@ Supported options: --split MASK:MODE Parse and reduce files specified by MASK using the given splitter. Can be repeated. MODE must be one of: %-(%s, %) + --json Load PATH as a JSON file (same syntax as --dump-json) --no-redirect Don't redirect stdout/stderr streams of test command + --temp-dir Write and run reduction candidates in this directory -j[N] Use N look-ahead processes (%d by default) EOS", args[0], splitterNames, totalCPUs); @@ -242,10 +267,12 @@ EOS"); stderr.write(q"EOS -h, --help Show this message Less interesting options: + --man Launch the project wiki web page in a web browser -V, --version Show program version --strategy STRAT Set strategy (careful/lookback/pingpong/indepth/inbreadth) - --dump Dump parsed tree to DIR.dump file - --dump-html Dump parsed tree to DIR.html file + --dump Dump parsed tree to PATH.dump file + --dump-html Dump parsed tree to PATH.html file + --dump-json Dump parsed tree to PATH.json file --times Display verbose spent time breakdown --cache DIR Use DIR as persistent disk cache (in addition to memory cache) @@ -278,23 +305,34 @@ EOS"); bool isDotName(string fn) { return fn.startsWith(".") && !(fn=="." || fn==".."); } - bool suspiciousFilesFound; - if (!force && isDir(dir)) + if (!readJson && !force && dir.exists && dir.isDir()) + { + bool suspiciousFilesFound; foreach (string path; dirEntries(dir, SpanMode.breadth)) if (isDotName(baseName(path)) || isDotName(baseName(dirName(path))) || extension(path)==".o" || extension(path)==".obj" || extension(path)==".exe") { stderr.writeln("Warning: Suspicious file found: ", path); suspiciousFilesFound = true; } - if (suspiciousFilesFound) - stderr.writeln("You should use a clean copy of the source tree.\nIf it was your intention to include this file in the file-set to be reduced,\nyou can use --force to silence this message."); + if (suspiciousFilesFound) + stderr.writeln("You should use a clean copy of the source tree.\nIf it was your intention to include this file in the file-set to be reduced,\nyou can use --force to silence this message."); + } ParseRule parseSplitRule(string rule) { auto p = rule.lastIndexOf(':'); - enforce(p > 0, "Invalid parse rule: " ~ rule); - auto pattern = rule[0..p]; - auto splitterName = rule[p+1..$]; + string pattern, splitterName; + if (p < 0) + { + pattern = "*"; + splitterName = rule; + } + else + { + enforce(p > 0, "Invalid parse rule: " ~ rule); + pattern = rule[0 .. p]; + splitterName = rule[p + 1 .. $]; + } auto splitterIndex = splitterNames.countUntil(splitterName); enforce(splitterIndex >= 0, "Unknown splitter: " ~ splitterName); return ParseRule(pattern, cast(Splitter)splitterIndex); @@ -304,7 +342,10 @@ EOS"); ParseOptions parseOptions; parseOptions.stripComments = stripComments; - parseOptions.mode = obfuscate ? 
ParseOptions.Mode.words : ParseOptions.Mode.source; + parseOptions.mode = + readJson ? ParseOptions.Mode.json : + obfuscate ? ParseOptions.Mode.words : + ParseOptions.Mode.source; parseOptions.rules = splitRules.map!parseSplitRule().array(); parseOptions.tabWidth = tabWidth; measure!"load"({root = loadFiles(dir, parseOptions);}); @@ -324,13 +365,15 @@ EOS"); resetProgress(root); if (doDump) - dumpSet(root, dirSuffix("dump")); + dumpSet(root, dirSuffix("dump", No.temp)); if (dumpHtml) - dumpToHtml(root, dirSuffix("html")); + dumpToHtml(root, dirSuffix("html", No.temp)); + if (dumpJson) + dumpToJson(root, dirSuffix("json", No.temp)); if (tester is null) { - writeln("No tester specified, exiting"); + stderr.writeln("No tester specified, exiting"); return 0; } @@ -338,10 +381,10 @@ EOS"); resultDir = dir; else { - resultDir = dirSuffix("reduced"); + resultDir = dirSuffix("reduced", No.temp); if (resultDir.exists) { - writeln("Hint: read https://github.com/CyberShadow/DustMite/wiki#result-directory-already-exists"); + stderr.writeln("Hint: read https://github.com/CyberShadow/DustMite/wiki#result-directory-already-exists"); throw new Exception("Result directory already exists"); } } @@ -355,20 +398,20 @@ EOS"); version (Posix) { if (testerFile.exists && (testerFile.getAttributes() & octal!111) == 0) - writeln("Hint: test program seems to be a non-executable file, try: chmod +x " ~ testerFile.escapeShellFileName()); + stderr.writeln("Hint: test program seems to be a non-executable file, try: chmod +x " ~ testerFile.escapeShellFileName()); } if (!testerFile.exists && tester.exists) - writeln("Hint: test program path should be relative to the source directory, try " ~ + stderr.writeln("Hint: test program path should be relative to the source directory, try " ~ tester.absolutePath.relativePath(dir.absolutePath).escapeShellFileName() ~ " instead of " ~ tester.escapeShellFileName()); if (!noRedirect) - writeln("Hint: use --no-redirect to see test script output"); - writeln("Hint: read https://github.com/CyberShadow/DustMite/wiki#initial-test-fails"); + stderr.writeln("Hint: use --no-redirect to see test script output"); + stderr.writeln("Hint: read https://github.com/CyberShadow/DustMite/wiki#initial-test-fails"); throw new Exception("Initial test fails: " ~ nullResult.reason); } } - lookaheadProcesses = new Lookahead[lookaheadCount]; + lookaheadProcessSlots = new LookaheadSlot[lookaheadCount]; foundAnything = false; string resultAdjective; @@ -397,20 +440,20 @@ EOS"); { if (noSave) measure!"resultSave"({safeSave(root, resultDir);}); - writefln("Done in %s tests and %s; %s version is in %s", tests, duration, resultAdjective, resultDir); + stderr.writefln("Done in %s tests and %s; %s version is in %s", tests, duration, resultAdjective, resultDir); } else { - writeln("Hint: read https://github.com/CyberShadow/DustMite/wiki#reduced-to-empty-set"); - writefln("Done in %s tests and %s; %s to empty set", tests, duration, resultAdjective); + stderr.writeln("Hint: read https://github.com/CyberShadow/DustMite/wiki#reduced-to-empty-set"); + stderr.writefln("Done in %s tests and %s; %s to empty set", tests, duration, resultAdjective); } } else - writefln("Done in %s tests and %s; no reductions found", tests, duration); + stderr.writefln("Done in %s tests and %s; no reductions found", tests, duration); if (showTimes) foreach (i, t; times.tupleof) - writefln("%s: %s", times.tupleof[i].stringof, times.tupleof[i].peek()); + stderr.writefln("%s: %s", times.tupleof[i].stringof, times.tupleof[i].peek()); return 0; } @@ 
-494,7 +537,8 @@ void recalculate(Entity root) e.deadHash.put(c.isWhite ? c : ' '); } - putString(e.filename); + if (e.file) + putString(e.file.name); putString(e.head); void addDependents(R)(R range, bool fresh) @@ -602,7 +646,7 @@ void recalculate(Entity root) return; } - inFile |= e.isFile; + inFile |= e.file !is null; assert(e.hash.length == e.deadHash.length); @@ -618,7 +662,8 @@ void recalculate(Entity root) auto start = pos; - putString(e.filename); + if (e.file) + putString(e.file.name); putString(e.head); foreach (c; e.children) passWO(c, inFile); @@ -778,7 +823,7 @@ struct ReductionIterator // Try next reduction type type = Reduction.Type.Concat; - if (e.isFile) + if (e.file) return; // Try this else { @@ -971,7 +1016,7 @@ bool nextAddress(ref size_t[] address, Entity root, bool descend) class LevelStrategy : IterativeStrategy { - bool levelChanged; + bool levelChanged; // We found some reductions while traversing this level bool invalid; override int getDepth() { return cast(int)address.length; } @@ -1084,22 +1129,18 @@ final class LookbackStrategy : LevelStrategy if (!nextInLevel()) { // End of level - if (levelChanged) - { - setLevel(currentLevel ? currentLevel - 1 : 0); - } - else - if (setLevel(maxLevel + 1)) - { - maxLevel = currentLevel; - } - else + auto nextLevel = levelChanged + ? currentLevel ? currentLevel - 1 : 0 + : maxLevel + 1; + if (!setLevel(nextLevel)) { if (iterationChanged) nextIteration(); else done = true; } + else + maxLevel = max(maxLevel, currentLevel); } } } @@ -1119,12 +1160,10 @@ final class PingPongStrategy : LevelStrategy if (!nextInLevel()) { // End of level - if (levelChanged) - { - setLevel(currentLevel ? currentLevel - 1 : 0); - } - else - if (!setLevel(currentLevel + 1)) + auto nextLevel = levelChanged + ? currentLevel ? currentLevel - 1 : 0 + : currentLevel + 1; + if (!setLevel(nextLevel)) { if (iterationChanged) nextIteration(); @@ -1202,12 +1241,12 @@ void reduceByStrategy(Strategy strategy) if (lastIteration != strategy.getIteration()) { - writefln("############### ITERATION %d ################", strategy.getIteration()); + stderr.writefln("############### ITERATION %d ################", strategy.getIteration()); lastIteration = strategy.getIteration(); } if (lastDepth != strategy.getDepth()) { - writefln("============= Depth %d =============", strategy.getDepth()); + stderr.writefln("============= Depth %d =============", strategy.getDepth()); lastDepth = strategy.getDepth(); } if (lastProgressGeneration != strategy.progressGeneration) @@ -1259,7 +1298,7 @@ void obfuscate(ref Entity root, bool keepLength) foreach (f; root.children) { - foreach (entity; parseToWords(f.filename) ~ f.children) + foreach (entity; parseToWords(f.file ? f.file.name : null) ~ f.children) if (entity.head.length && !isDigit(entity.head[0])) if (entity.head !in wordSet) { @@ -1373,20 +1412,24 @@ void dump(Writer)(Entity root, Writer writer) if (e.dead) { if (inFile && e.contents.length) - writer.handleText(e.contents[e.filename.length .. $]); + writer.handleText(e.contents[(e.file ? e.file.name : null).length .. 
$]); } else - if (!inFile && e.isFile) + if (!inFile && e.file) { - writer.handleFile(e.filename); + writer.handleFile(e.file); foreach (c; e.children) dumpEntity!true(c); } else { if (inFile && e.head.length) writer.handleText(e.head); - foreach (c; e.children) - dumpEntity!inFile(c); + if (inFile) + foreach (c; e.children) + dumpEntity!inFile(c); + else // Create files in reverse order, so that directories' timestamps get set last + foreach_reverse (c; e.children) + dumpEntity!inFile(c); if (inFile && e.tail.length) writer.handleText(e.tail); } } @@ -1398,59 +1441,194 @@ static struct FastWriter(Next) /// Accelerates Writer interface by bulking conti { Next next; immutable(char)* start, end; - void finish() + + private void flush() { if (start != end) next.handleText(start[0 .. end - start]); start = end = null; } - void handleFile(string s) + + void handleFile(const(Entity.FileProperties)* fileProperties) { - finish(); - next.handleFile(s); + flush(); + next.handleFile(fileProperties); } + void handleText(string s) { if (s.ptr != end) { - finish(); + flush(); start = s.ptr; } end = s.ptr + s.length; } - ~this() { finish(); } + + void finish() + { + flush(); + next.finish(); + } } -void save(Entity root, string savedir) +// Workaround for https://issues.dlang.org/show_bug.cgi?id=23683 +// Remove when moving to a DMD version incorporating a fix +version (Windows) { - safeDelete(savedir); - safeMkdir(savedir); + import core.sys.windows.winbase; + import core.sys.windows.winnt; + import std.windows.syserror; - static struct DiskWriter + alias AliasSeq(Args...) = Args; + alias FSChar = WCHAR; + void setTimes(const(char)[] name, + SysTime accessTime, + SysTime modificationTime) { - string dir; + auto namez = (name ~ "\0").to!(FSChar[]).ptr; - File o; - typeof(o.lockingBinaryWriter()) binaryWriter; + import std.datetime.systime : SysTimeToFILETIME; + const ta = SysTimeToFILETIME(accessTime); + const tm = SysTimeToFILETIME(modificationTime); + alias defaults = + AliasSeq!(FILE_WRITE_ATTRIBUTES, + 0, + null, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL | + FILE_ATTRIBUTE_DIRECTORY | + FILE_FLAG_BACKUP_SEMANTICS, + HANDLE.init); + auto h = CreateFileW(namez, defaults); - void handleFile(string fn) - { - static Appender!(char[]) pathBuf; - pathBuf.clear(); - pathBuf.put(dir.chainPath(fn)); - auto path = pathBuf.data; - if (!exists(dirName(path))) - safeMkdir(dirName(path)); + wenforce(h != INVALID_HANDLE_VALUE, "CreateFileW: " ~ name); + + scope(exit) + wenforce(CloseHandle(h), "CloseHandle: " ~ name); + + wenforce(SetFileTime(h, null, &ta, &tm), "SetFileTime: " ~ name); + } +} + +static struct DiskWriter +{ + string dir; + + const(Entity.FileProperties)* fileProperties; + // Regular files + File o; + typeof(o.lockingBinaryWriter()) binaryWriter; + // Symlinks + Appender!(char[]) symlinkBuf; + + @property const(char)[] currentFilePath() + { + static Appender!(char[]) pathBuf; + pathBuf.clear(); + pathBuf.put(dir.chainPath(fileProperties.name)); + return pathBuf.data; + } + + void handleFile(const(Entity.FileProperties)* fileProperties) + { + finish(); + + this.fileProperties = fileProperties; + scope(failure) this.fileProperties = null; + + auto path = currentFilePath; + if (!exists(dirName(path))) + safeMkdir(dirName(path)); // TODO make directories nested instead + + if (attrIsSymlink(fileProperties.mode.get(0))) + symlinkBuf.clear(); + else + if (attrIsDir(fileProperties.mode.get(0))) + {} + else // regular file + { o.open(cast(string)path, "wb"); binaryWriter = o.lockingBinaryWriter; } + } - void 
handleText(string s) + void handleText(string s) + { + if (attrIsSymlink(fileProperties.mode.get(0))) + symlinkBuf.put(s); + else + if (attrIsDir(fileProperties.mode.get(0))) + enforce(s.length == 0, "Directories cannot have contents"); + else // regular file { + assert(o.isOpen); binaryWriter.put(s); } } + + void finish() + { + if (fileProperties) + { + scope(exit) fileProperties = null; + + auto path = currentFilePath; + + if (attrIsSymlink(fileProperties.mode.get(0))) + symlink(symlinkBuf.data, path); + else + if (attrIsDir(fileProperties.mode.get(0))) + mkdirRecurse(path); + else // regular file + { + assert(o.isOpen); + binaryWriter = typeof(binaryWriter).init; + o.close(); + o = File.init; // Avoid crash on Windows + } + + if (!fileProperties.mode.isNull) + { + auto mode = fileProperties.mode.get(); + if (!attrIsSymlink(mode)) + setAttributes(path, mode); + } + if (!fileProperties.times.isNull) + setTimes(path, fileProperties.times.get()[0], fileProperties.times.get()[1]); + } + } +} + +struct MemoryWriter +{ + char[] buf; + size_t pos; + + void handleFile(const(Entity.FileProperties)* fileProperties) {} + + void handleText(string s) + { + auto end = pos + s.length; + if (buf.length < end) + { + buf.length = end; + buf.length = buf.capacity; + } + buf[pos .. end] = s; + pos = end; + } + + void reset() { pos = 0; } + char[] data() { return buf[0 .. pos]; } +} + +void save(Entity root, string savedir) +{ + safeDelete(savedir); + safeMkdir(savedir); + FastWriter!DiskWriter writer; writer.next.dir = savedir; dump(root, &writer); @@ -1518,7 +1696,7 @@ bool tryReduction(ref Entity root, Reduction r) if (newRoot is root) { assert(r.type != Reduction.Type.None); - writeln(r, " => N/A"); + stderr.writeln(r, " => N/A"); return false; } if (test(newRoot, [r]).success) @@ -1606,7 +1784,8 @@ Entity applyReductionImpl(Entity origRoot, ref Reduction r) { auto fa = rootAddress.children[i]; auto f = edit(fa); - f.filename = applyReductionToPath(f.filename, r); + if (f.file) + f.file.name = applyReductionToPath(f.file.name, r); foreach (j, const word; f.children) if (word.head == r.from) edit(fa.children[j]).head = r.to; @@ -1660,7 +1839,7 @@ Entity applyReductionImpl(Entity origRoot, ref Reduction r) { if (e.dead) return; - if (e.isFile) + if (e.file) { // Skip noRemove files, except when they are the target // (in which case they will keep their contents after the reduction). 
@@ -1850,10 +2029,10 @@ RoundRobinCache!(ReductionCacheKey, Entity) reductionCache; Entity applyReduction(Entity origRoot, ref Reduction r) { - if (lookaheadProcesses.length) + if (lookaheadProcessSlots.length) { if (!reductionCache.keys) - reductionCache.requireSize(1 + lookaheadProcesses.length); + reductionCache.requireSize(1 + lookaheadProcessSlots.length); auto cacheKey = ReductionCacheKey(origRoot, r); return reductionCache.get(cacheKey, applyReductionImpl(origRoot, r)); @@ -1894,10 +2073,10 @@ void autoRetry(scope void delegate() fun, lazy const(char)[] operation) } catch (Exception e) { - writeln("Error while attempting to " ~ operation ~ ": " ~ e.msg); + stderr.writeln("Error while attempting to " ~ operation ~ ": " ~ e.msg); import core.thread; Thread.sleep(dur!"seconds"(1)); - writeln("Retrying..."); + stderr.writeln("Retrying..."); } } @@ -1955,14 +2134,15 @@ void saveResult(Entity root) measure!"resultSave"({safeSave(root, resultDir);}); } -struct Lookahead +struct LookaheadSlot { + bool active; Thread thread; shared Pid pid; string testdir; EntityHash digest; } -Lookahead[] lookaheadProcesses; +LookaheadSlot[] lookaheadProcessSlots; TestResult[EntityHash] lookaheadResults; @@ -2002,10 +2182,14 @@ struct TestResult lookahead, diskCache, ramCache, + reject, + error, } Source source; int status; + string error; + string reason() { final switch (source) @@ -2022,6 +2206,10 @@ struct TestResult return "Test result was cached on disk as " ~ (success ? "success" : "failure"); case Source.ramCache: return "Test result was cached in memory as " ~ (success ? "success" : "failure"); + case Source.reject: + return "Test result was rejected by a --reject rule"; + case Source.error: + return "Error: " ~ error; } } } @@ -2031,7 +2219,7 @@ TestResult test( Reduction[] reductions, /// For display purposes only ) { - writef("%-(%s, %) => ", reductions); stdout.flush(); + stderr.writef("%-(%s, %) => ", reductions); stdout.flush(); EntityHash digest = root.hash; @@ -2041,7 +2229,7 @@ TestResult test( if (cacheResult) { // Note: as far as I can see, a cache hit for a positive reduction is not possible (except, perhaps, for a no-op reduction) - writeln(*cacheResult ? "Yes" : "No", " (cached)"); + stderr.writeln(*cacheResult ? 
"Yes" : "No", " (cached)"); return TestResult(*cacheResult, TestResult.Source.ramCache); } auto result = fallback; @@ -2062,13 +2250,13 @@ TestResult test( measure!"globalCache"({ found = exists(cacheBase~"0"); }); if (found) { - writeln("No (disk cache)"); + stderr.writeln("No (disk cache)"); return TestResult(false, TestResult.Source.diskCache); } measure!"globalCache"({ found = exists(cacheBase~"1"); }); if (found) { - writeln("Yes (disk cache)"); + stderr.writeln("Yes (disk cache)"); return TestResult(true, TestResult.Source.diskCache); } auto result = fallback; @@ -2085,34 +2273,57 @@ TestResult test( { // Handle existing lookahead jobs - TestResult reap(ref Lookahead process, int status) + Nullable!TestResult reapThread(ref LookaheadSlot slot) { - scope(success) process = Lookahead.init; - safeDelete(process.testdir); - if (process.thread) - process.thread.join(/*rethrow:*/true); - return lookaheadResults[process.digest] = TestResult(status == 0, TestResult.Source.lookahead, status); + try + { + slot.thread.join(/*rethrow:*/true); + slot.thread = null; + return typeof(return)(); + } + catch (Exception e) + { + scope(success) slot = LookaheadSlot.init; + safeDelete(slot.testdir); + auto result = TestResult(false, TestResult.Source.error); + result.error = e.msg; + lookaheadResults[slot.digest] = result; + return typeof(return)(result); + } + } + + TestResult reapProcess(ref LookaheadSlot slot, int status) + { + scope(success) slot = LookaheadSlot.init; + safeDelete(slot.testdir); + if (slot.thread) + reapThread(slot); // should be null + return lookaheadResults[slot.digest] = TestResult(status == 0, TestResult.Source.lookahead, status); } - foreach (ref process; lookaheadProcesses) - if (process.thread) + foreach (ref slot; lookaheadProcessSlots) // Reap threads + if (slot.thread) { debug (DETERMINISTIC_LOOKAHEAD) - { - process.thread.join(/*rethrow:*/true); - process.thread = null; - } + reapThread(slot); + else + if (!slot.thread.isRunning) + reapThread(slot); + } - auto pid = cast()atomicLoad(process.pid); + foreach (ref slot; lookaheadProcessSlots) // Reap processes + if (slot.active) + { + auto pid = cast()atomicLoad(slot.pid); if (pid) { debug (DETERMINISTIC_LOOKAHEAD) - reap(process, pid.wait()); + reapProcess(slot, pid.wait()); else { auto waitResult = pid.tryWait(); if (waitResult.terminated) - reap(process, waitResult.status); + reapProcess(slot, waitResult.status); } } } @@ -2132,8 +2343,8 @@ TestResult test( size_t numSteps; - foreach (ref process; lookaheadProcesses) - while (!process.thread && !predictionTree.empty) + foreach (ref slot; lookaheadProcessSlots) + while (!slot.active && !predictionTree.empty) { auto state = predictionTree.front; predictionTree.removeFront(); @@ -2141,7 +2352,7 @@ TestResult test( retryIter: if (state.iter.done) continue; - reductionCache.requireSize(lookaheadProcesses.length + ++numSteps); + reductionCache.requireSize(lookaheadProcessSlots.length + ++numSteps); auto reduction = state.iter.front; Entity newRoot; measure!"lookaheadApply"({ newRoot = state.iter.root.applyReduction(reduction); }); @@ -2154,7 +2365,7 @@ TestResult test( auto digest = newRoot.hash; double prediction; - if (digest in cache || digest in lookaheadResults || lookaheadProcesses[].canFind!(p => p.thread && p.digest == digest)) + if (digest in cache || digest in lookaheadResults || lookaheadProcessSlots[].canFind!(p => p.thread && p.digest == digest)) { if (digest in cache) prediction = cache[digest] ? 
1 : 0; @@ -2166,25 +2377,26 @@ TestResult test( } else { - process.digest = digest; + slot.active = true; + slot.digest = digest; static int counter; - process.testdir = dirSuffix("lookahead.%d".format(counter++)); + slot.testdir = dirSuffix("lookahead.%d".format(counter++), Yes.temp); // Saving and process creation are expensive. // Don't block the main thread, use a worker thread instead. - static void runThread(Entity newRoot, ref Lookahead process, string tester) + static void runThread(Entity newRoot, ref LookaheadSlot slot, string tester) { - process.thread = new Thread({ - save(newRoot, process.testdir); + slot.thread = new Thread({ + save(newRoot, slot.testdir); auto nul = File(nullFileName, "w+"); - auto pid = spawnShell(tester, nul, nul, nul, null, Config.none, process.testdir); - atomicStore(process.pid, cast(shared)pid); + auto pid = spawnShell(tester, nul, nul, nul, null, Config.none, slot.testdir); + atomicStore(slot.pid, cast(shared)pid); }); - process.thread.start(); + slot.thread.start(); } - runThread(newRoot, process, tester); + runThread(newRoot, slot, tester); prediction = state.predictor.predict(); } @@ -2209,26 +2421,35 @@ TestResult test( auto plookaheadResult = digest in lookaheadResults; if (plookaheadResult) { - writeln(plookaheadResult.success ? "Yes" : "No", " (lookahead)"); + stderr.writeln(plookaheadResult.success ? "Yes" : "No", " (lookahead)"); return *plookaheadResult; } - foreach (ref process; lookaheadProcesses) + foreach (ref slot; lookaheadProcessSlots) { - if (process.thread && process.digest == digest) + if (slot.active && slot.digest == digest) { // Current test is already being tested in the background, wait for its result. // Join the thread first, to guarantee that there is a pid - measure!"lookaheadWaitThread"({ process.thread.join(/*rethrow:*/true); }); - process.thread = null; + if (slot.thread) + { + auto result = measure!"lookaheadWaitThread"({ + return reapThread(slot); + }); + if (!result.isNull) + { + stderr.writefln("%s (lookahead-wait: %s)", result.get().success ? "Yes" : "No", result.get().source); + return result.get(); + } + } - auto pid = cast()atomicLoad(process.pid); + auto pid = cast()atomicLoad(slot.pid); int exitCode; measure!"lookaheadWaitProcess"({ exitCode = pid.wait(); }); - auto result = reap(process, exitCode); - writeln(result.success ? "Yes" : "No", " (lookahead-wait)"); + auto result = reapProcess(slot, exitCode); + stderr.writeln(result.success ? "Yes" : "No", " (lookahead-wait)"); return result; } } @@ -2237,29 +2458,91 @@ TestResult test( return fallback; } + TestResult testReject(lazy TestResult fallback) + { + if (rejectRules.length) + { + bool defaultReject = !rejectRules.front.remove; + + bool scan(Entity e) + { + if (e.file) + { + static MemoryWriter writer; + writer.reset(); + dump(e, &writer); + + static bool[] removeCharBuf; + if (removeCharBuf.length < writer.data.length) + removeCharBuf.length = writer.data.length; + auto removeChar = removeCharBuf[0 .. writer.data.length]; + removeChar[] = defaultReject; + + foreach (ref rule; rejectRules) + if (rule.regexp !is Regex!char.init) + foreach (m; writer.data.matchAll(rule.regexp)) + { + auto start = m.hit.ptr - writer.data.ptr; + auto end = start + m.hit.length; + removeChar[start .. 
end] = rule.remove; + } + + if (removeChar.canFind(true)) + return true; + } + else + foreach (c; e.children) + if (scan(c)) + return true; + return false; + } + + if (scan(root)) + { + stderr.writeln("No (rejected)"); + return TestResult(false, TestResult.Source.reject); + } + } + return fallback; + } + + TestResult handleError(lazy TestResult fallback) + { + try + return fallback; + catch (Exception e) + { + auto result = TestResult(false, TestResult.Source.error); + result.error = e.msg; + stderr.writefln("No (error: %s)", e.msg); + return result; + } + } + TestResult doTest() { - string testdir = dirSuffix("test"); + string testdir = dirSuffix("test", Yes.temp); measure!"testSave"({save(root, testdir);}); scope(exit) measure!"clean"({safeDelete(testdir);}); + auto nullRead = File(nullFileName, "rb"); Pid pid; if (noRedirect) - pid = spawnShell(tester, null, Config.none, testdir); + pid = spawnShell(tester, nullRead, stdout , stderr , null, Config.none, testdir); else { - auto nul = File(nullFileName, "w+"); - pid = spawnShell(tester, nul, nul, nul, null, Config.none, testdir); + auto nullWrite = File(nullFileName, "wb"); + pid = spawnShell(tester, nullRead, nullWrite, nullWrite, null, Config.none, testdir); } int status; measure!"test"({status = pid.wait();}); auto result = TestResult(status == 0, TestResult.Source.tester, status); - writeln(result.success ? "Yes" : "No"); + stderr.writeln(result.success ? "Yes" : "No"); return result; } - auto result = ramCached(diskCached(lookahead(doTest()))); - if (trace) saveTrace(root, reductions, dirSuffix("trace"), result.success); + auto result = ramCached(diskCached(testReject(lookahead(handleError(doTest()))))); + if (trace) saveTrace(root, reductions, dirSuffix("trace", No.temp), result.success); return result; } @@ -2323,20 +2606,20 @@ void applyNoRemoveRules(Entity root, RemoveRule[] removeRules) // don't remove anything except what's specified by the rule. bool defaultRemove = !removeRules.front.remove; - auto files = root.isFile ? [root] : root.children; + auto files = root.file ? [root] : root.children; foreach (f; files) { - assert(f.isFile); + assert(f.file); // Check file name bool removeFile = defaultRemove; foreach (rule; removeRules) { if ( - (rule.shellGlob && f.filename.globMatch(rule.shellGlob)) + (rule.shellGlob && f.file.name.globMatch(rule.shellGlob)) || - (rule.regexp !is Regex!char.init && f.filename.match(rule.regexp)) + (rule.regexp !is Regex!char.init && f.file.name.match(rule.regexp)) ) removeFile = rule.remove; } @@ -2359,6 +2642,7 @@ void applyNoRemoveRules(Entity root, RemoveRule[] removeRules) return true; auto start = s.ptr - f.contents.ptr; auto end = start + s.length; + assert(start <= end && end <= f.contents.length, "String is not a slice of the file"); return removeChar[start .. 
end].all; } @@ -2406,10 +2690,10 @@ void loadCoverage(Entity root, string dir) { void scanFile(Entity f) { - auto fn = buildPath(dir, setExtension(baseName(f.filename), "lst")); + auto fn = buildPath(dir, setExtension(baseName(f.file.name), "lst")); if (!exists(fn)) return; - writeln("Loading coverage file ", fn); + stderr.writeln("Loading coverage file ", fn); static bool covered(string line) { @@ -2451,7 +2735,7 @@ void loadCoverage(Entity root, string dir) void scanFiles(Entity e) { - if (e.isFile) + if (e.file) scanFile(e); else foreach (c; e.children) @@ -2492,7 +2776,8 @@ void convertRefs(Entity root) void convertRef(ref EntityRef r) { assert(r.entity && !r.address); - r.address = addresses[r.entity.id]; + r.address = addresses.get(r.entity.id, null); + assert(r.address, "Dependent not in tree"); r.entity = null; } @@ -2597,7 +2882,7 @@ void dumpSet(Entity root, string fn) f.write( " ", e.redirect ? "-> " ~ text(findEntityEx(root, e.redirect).entity.id) ~ " " : "", - e.isFile ? e.filename ? printableFN(e.filename) ~ " " : null : e.head ? printable(e.head) ~ " " : null, + e.file ? e.file.name ? printableFN(e.file.name) ~ " " : null : e.head ? printable(e.head) ~ " " : null, e.tail ? printable(e.tail) ~ " " : null, e.comment ? "/* " ~ e.comment ~ " */ " : null, "]" @@ -2606,7 +2891,7 @@ void dumpSet(Entity root, string fn) else { f.writeln(e.comment ? " // " ~ e.comment : null); - if (e.isFile) f.writeln(prefix, " ", printableFN(e.filename)); + if (e.file) f.writeln(prefix, " ", printableFN(e.file.name)); if (e.head) f.writeln(prefix, " ", printable(e.head)); foreach (c; e.children) print(c, depth+1); @@ -2654,10 +2939,10 @@ void dumpToHtml(Entity root, string fn) void dump(Entity e) { - if (e.isFile) + if (e.file) { buf.put("
<h1>");
-			dumpText(e.filename);
+			dumpText(e.file.name);
 			buf.put("</h1><pre>");
 			foreach (c; e.children)
 				dump(c);
@@ -2683,6 +2968,61 @@ EOT");
 	std.file.write(fn, buf.data());
 }
 
+void dumpToJson(Entity root, string fn)
+{
+	import std.json : JSONValue;
+
+	bool[const(Address)*] needLabel;
+
+	void scan(Entity e, const(Address)* addr)
+	{
+		foreach (dependent; e.dependents)
+		{
+			assert(dependent.address);
+			needLabel[dependent.address] = true;
+		}
+		foreach (i, child; e.children)
+			scan(child, addr.child(i));
+	}
+	scan(root, &rootAddress);
+
+	JSONValue toJson(Entity e, const(Address)* addr)
+	{
+		JSONValue[string] o;
+
+		if (e.file)
+			o["filename"] = e.file.name;
+
+		if (e.head.length)
+			o["head"] = e.head;
+		if (e.children.length)
+			o["children"] = e.children.length.iota.map!(i =>
+				toJson(e.children[i], addr.child(i))
+			).array;
+		if (e.tail.length)
+			o["tail"] = e.tail;
+
+		if (e.noRemove)
+			o["noRemove"] = true;
+
+		if (addr in needLabel)
+			o["label"] = e.id.to!string;
+		if (e.dependents.length)
+			o["dependents"] = e.dependents.map!((ref dependent) =>
+				root.findEntity(dependent.address).entity.id.to!string
+			).array;
+
+		return JSONValue(o);
+	}
+
+	auto jsonDoc = JSONValue([
+		"version" : JSONValue(1),
+		"root" : toJson(root, &rootAddress),
+	]);
+
+	std.file.write(fn, jsonDoc.toPrettyString());
+}
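A minimal sketch (not part of the patch; file names and contents are invented) of the document shape that dumpToJson emits and that loadJson in splitter.d accepts back via --json. The field names match the code above; "dependents" entries refer to another entity's "label":

	import std.json;

	auto doc = JSONValue([
		"version" : JSONValue(1),
		"root" : JSONValue([
			"children" : JSONValue([
				JSONValue([
					"filename" : JSONValue("a.d"),
					"head"     : JSONValue("int x;\n"),
					"label"    : JSONValue("1"),
				]),
				JSONValue([
					"filename"   : JSONValue("b.d"),
					"head"       : JSONValue("int y;\n"),
					"dependents" : JSONValue([JSONValue("1")]),
				]),
			]),
		]),
	]);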
+
 // void dumpText(string fn, ref Reduction r = nullReduction)
 // {
 // 	auto f = File(fn, "wt");
@@ -2694,7 +3034,7 @@ version(testsuite)
 shared static this()
 {
 	import core.runtime;
-	"../cov".mkdir.collectException();
-	dmd_coverDestPath("../cov");
+	"../../cov".mkdir.collectException();
+	dmd_coverDestPath("../../cov");
 	dmd_coverSetMerge(true);
 }
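A usage note for the reworked measure template near the top of this file (now `T measure(string what, T)(scope T delegate() p)`): it forwards the delegate's return value, so timed calls compose as expressions. A minimal sketch, where `pid` stands for some already-running child process (an assumption for illustration):

	// The wait time is attributed to times.test:
	int exitCode = measure!"test"({ return pid.wait(); });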
diff --git a/DustMite/polyhash.d b/DustMite/polyhash.d
index 5fa9766f62..13ab910d17 100644
--- a/DustMite/polyhash.d
+++ b/DustMite/polyhash.d
@@ -290,8 +290,8 @@ if (is(T : long) && T.sizeof >= 2)
 				asm
 				{
 					"`~x86SignedOpPrefix!T~`mul`~x86SizeOpSuffix!T~` %3"
-					: "=a" low, "=d" high
-					: "a" a, "rm" b;
+					: "=a"(low), "=d"(high)
+					: "a"(a), "rm"(b);
 				}
 			`);
 			return typeof(return)(low, high);
@@ -363,8 +363,8 @@ if (is(T : long) && T.sizeof >= 2 && is(L == LongInt!T))
 				asm
 				{
 					"`~x86SignedOpPrefix!T~`div`~x86SizeOpSuffix!T~` %4"
-					: "=a" quotient, "=d" remainder
-					: "a" low, "d" high, "rm" b;
+					: "=a"(quotient), "=d"(remainder)
+					: "a"(low), "d"(high), "rm"(b);
 				}
 			`);
 			return typeof(return)(quotient, remainder);
diff --git a/DustMite/splitter.d b/DustMite/splitter.d
index ab5da91cc6..be8d5bf372 100644
--- a/DustMite/splitter.d
+++ b/DustMite/splitter.d
@@ -8,14 +8,18 @@ import std.ascii;
 import std.algorithm;
 import std.array;
 import std.conv;
+import std.datetime.systime;
 import std.exception;
 import std.file;
 import std.functional;
 import std.path;
 import std.range;
+import std.stdio : File, stdin;
 import std.string;
 import std.traits;
 import std.stdio : stderr;
+import std.typecons;
+import std.utf : byChar;
 
 import polyhash;
 
@@ -65,8 +69,15 @@ final class Entity
 	Entity[] children;     /// This node's children nodes, e.g. the statements of the statement block.
 	string tail;           /// This node's "tail", e.g. "}" for a statement block.
 
-	string filename, contents;
-	@property bool isFile() { return filename != ""; }
+	string contents;
+
+	struct FileProperties
+	{
+		string name;       /// Relative to the reduction root
+		Nullable!uint mode; /// OS-specific (std.file.getAttributes)
+		Nullable!(SysTime[2]) times; /// Access and modification times
+	}
+	FileProperties* file;  /// If non-null, this node represents a file
 
 	bool isPair;           /// Internal hint for --dump output
 	bool noRemove;         /// Don't try removing this entity (children OK)
@@ -133,22 +144,18 @@ private: // Used during parsing only
 	debug string[] comments;  /// Used to debug the splitter
 }
 
-enum Mode
-{
-	source,
-	words,     /// split identifiers, for obfuscation
-}
-
 enum Splitter
 {
 	files,     /// Load entire files only
 	lines,     /// Split by line ends
+	null_,     /// Split by the \0 (NUL) character
 	words,     /// Split by whitespace
 	D,         /// Parse D source code
 	diff,      /// Unified diffs
 	indent,    /// Indentation (Python, YAML...)
+	lisp,      /// Lisp and similar languages
 }
-immutable string[] splitterNames = [EnumMembers!Splitter].map!(e => e.text().toLower()).array();
+immutable string[] splitterNames = [EnumMembers!Splitter].map!(e => e.text().toLower().chomp("_")).array();
 
 struct ParseRule
 {
@@ -158,7 +165,12 @@ struct ParseRule
 
 struct ParseOptions
 {
-	enum Mode { source, words }
+	enum Mode
+	{
+		source,
+		words,     /// split identifiers, for obfuscation
+		json,
+	}
 
 	bool stripComments;
 	ParseRule[] rules;
@@ -166,21 +178,22 @@ struct ParseOptions
 	uint tabWidth;
 }
 
+version (Posix) {} else
+{
+	// Non-POSIX symlink stubs
+	string readLink(const(char)[]) { throw new Exception("Sorry, symbolic links are only supported on POSIX systems"); }
+	void symlink(const(char)[], const(char)[]) { throw new Exception("Sorry, symbolic links are only supported on POSIX systems"); }
+}
+
 /// Parse the given file/directory.
-/// For files, modifies path to be the base name for .test / .reduced directories.
+/// For files, modifies `path` to be the base name for .test / .reduced directories.
 Entity loadFiles(ref string path, ParseOptions options)
 {
-	if (isFile(path))
-	{
-		auto filePath = path;
-		path = stripExtension(path);
-		return loadFile(filePath.baseName(), filePath, options);
-	}
-	else
+	if (path != "-" && !path.isSymlink && path.exists && path.isDir)
 	{
 		auto set = new Entity();
-		foreach (string entry; dirEntries(path, SpanMode.breadth).array.sort!((a, b) => a.name < b.name))
-			if (isFile(entry))
+		foreach (string entry; dirEntries(path, SpanMode.breadth, /*followSymlink:*/false).array.sort!((a, b) => a.name < b.name))
+			if (isSymlink(entry) || isFile(entry) || isDir(entry))
 			{
 				assert(entry.startsWith(path));
 				auto name = entry[path.length+1..$];
@@ -188,6 +201,16 @@ Entity loadFiles(ref string path, ParseOptions options)
 			}
 		return set;
 	}
+	else
+	{
+		auto realPath = path;
+		string name; // For Entity.FileProperties.name
+		if (path == "-" || path == "/dev/stdin")
+			name = path = "stdin";
+		else
+			name = realPath.baseName();
+		return loadFile(name, realPath, options);
+	}
 }
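A hedged usage sketch of the stdin support above: passing "-" loads the input from standard input, and `path` is rewritten to "stdin", which then names the .test / .reduced directories:

	string path = "-";
	auto root = loadFiles(path, ParseOptions.init); // blocks, reading stdin
	assert(path == "stdin");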
 
 enum BIN_SIZE = 2;
@@ -239,61 +262,117 @@ immutable ParseRule[] defaultRules =
 [
 	{ "*.d"    , Splitter.D     },
 	{ "*.di"   , Splitter.D     },
+
 	{ "*.diff" , Splitter.diff  },
 	{ "*.patch", Splitter.diff  },
+
+	{ "*.lisp" , Splitter.lisp  },
+	{ "*.cl"   , Splitter.lisp  },
+	{ "*.lsp"  , Splitter.lisp  },
+	{ "*.el"   , Splitter.lisp  },
+
 	{ "*"      , Splitter.files },
 ];
 
+void[] readFile(File f)
+{
+	import std.range.primitives : put;
+	auto result = appender!(ubyte[]);
+	auto size = f.size;
+	if (size <= uint.max)
+		result.reserve(cast(size_t)size);
+	put(result, f.byChunk(64 * 1024));
+	return result.data;
+}
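readFile reads through byChunk, so it also works on unseekable streams such as stdin, where std.file.read is not applicable; File.size can report ulong.max for such streams, hence the `size <= uint.max` guard before reserving. E.g. (sketch):

	auto text = cast(string) readFile(stdin); // slurp a stream of unknown size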
+
 Entity loadFile(string name, string path, ParseOptions options)
 {
-	stderr.writeln("Loading ", path);
+	auto base = name.baseName();
+	Splitter splitterType = chain(options.rules, defaultRules).find!(rule => base.globMatch(rule.pattern)).front.splitter;
+
+	Nullable!uint mode;
+	if (path != "-")
+	{
+		mode = getLinkAttributes(path);
+		if (attrIsSymlink(mode.get()) || attrIsDir(mode.get()))
+			splitterType = Splitter.files;
+	}
+
+	stderr.writeln("Loading ", path, " [", splitterType, "]");
+	auto contents =
+		attrIsSymlink(mode.get(0)) ? path.readLink() :
+		attrIsDir(mode.get(0)) ? null :
+		cast(string)readFile(path == "-" ? stdin : File(path, "rb"));
+
+	if (options.mode == ParseOptions.Mode.json)
+		return loadJson(contents);
+
 	auto result = new Entity();
-	result.filename = name.replace(`\`, `/`);
-	result.contents = cast(string)read(path);
+	result.file = new Entity.FileProperties;
+	result.file.name = name.replace(dirSeparator, `/`);
+	result.file.mode = mode;
+	if (!mode.isNull() && !attrIsSymlink(mode.get()) && path != "-")
+	{
+		SysTime accessTime, modificationTime;
+		getTimes(path, accessTime, modificationTime);
+		result.file.times = [accessTime, modificationTime];
+	}
+	result.contents = contents;
 
-	auto base = name.baseName();
-	foreach (rule; chain(options.rules, defaultRules))
-		if (base.globMatch(rule.pattern))
-		{
-			final switch (rule.splitter)
+	final switch (splitterType)
+	{
+		case Splitter.files:
+			result.children = [new Entity(result.contents, null, null)];
+			break;
+		case Splitter.lines:
+			result.children = parseToLines(result.contents);
+			break;
+		case Splitter.words:
+			result.children = parseToWords(result.contents);
+			break;
+		case Splitter.null_:
+			result.children = parseToNull(result.contents);
+			break;
+		case Splitter.D:
+			if (result.contents.startsWith("Ddoc"))
+				goto case Splitter.files;
+
+			DSplitter splitter;
+			if (options.stripComments)
+				result.contents = splitter.stripComments(result.contents);
+
+			final switch (options.mode)
 			{
-				case Splitter.files:
-					result.children = [new Entity(result.contents, null, null)];
-					return result;
-				case Splitter.lines:
-					result.children = parseToLines(result.contents);
-					return result;
-				case Splitter.words:
-					result.children = parseToWords(result.contents);
-					return result;
-				case Splitter.D:
-				{
-					if (result.contents.startsWith("Ddoc"))
-						goto case Splitter.files;
+				case ParseOptions.Mode.json:
+					assert(false);
+				case ParseOptions.Mode.source:
+					result.children = splitter.parse(result.contents);
+					break;
+				case ParseOptions.Mode.words:
+					result.children = splitter.parseToWords(result.contents);
+					break;
+			}
+			break;
+		case Splitter.diff:
+			result.children = parseDiff(result.contents);
+			break;
+		case Splitter.indent:
+			result.children = parseIndent(result.contents, options.tabWidth);
+			break;
+		case Splitter.lisp:
+			result.children = parseLisp(result.contents);
+			break;
+	}
 
-					DSplitter splitter;
-					if (options.stripComments)
-						result.contents = splitter.stripComments(result.contents);
+	debug
+	{
+		string resultContents;
+		void walk(Entity[] entities) { foreach (e; entities) { resultContents ~= e.head; walk(e.children); resultContents ~= e.tail; }}
+		walk(result.children);
+		assert(result.contents == resultContents, "Contents mismatch after splitting:\n" ~ resultContents);
+	}
 
-					final switch (options.mode)
-					{
-						case ParseOptions.Mode.source:
-							result.children = splitter.parse(result.contents);
-							return result;
-						case ParseOptions.Mode.words:
-							result.children = splitter.parseToWords(result.contents);
-							return result;
-					}
-				}
-				case Splitter.diff:
-					result.children = parseDiff(result.contents);
-					return result;
-				case Splitter.indent:
-					result.children = parseIndent(result.contents, options.tabWidth);
-					return result;
-			}
-		}
-	assert(false); // default * rule should match everything
+	return result;
 }
 
 // *****************************************************************************************************************************************************************************
@@ -866,6 +945,49 @@ struct DSplitter
 		}
 	}
 
+	// Join together module names. We should not attempt to reduce "import std.stdio" to "import std" (or "import stdio").
+	static void postProcessImports(ref Entity[] entities)
+	{
+		if (entities.length && entities[0].head.strip == "import" && !entities[0].children.length && !entities[0].tail.length)
+			foreach (entity; entities[1 .. $])
+			{
+				static void visit(Entity entity)
+				{
+					static bool isValidModuleName(string s) { return s.byChar.all!(c => isWordChar(c) || isWhite(c) || c == '.'); }
+					static bool canBeMerged(Entity entity)
+					{
+						return
+							isValidModuleName(entity.head) &&
+							entity.children.all!(child => canBeMerged(child)) &&
+							isValidModuleName(entity.tail);
+					}
+
+					if (canBeMerged(entity))
+					{
+						auto root = entity;
+						// Link all ancestors to the root, and in reverse, therefore making them inextricable.
+						void link(Entity entity)
+						{
+							entity.dependents ~= EntityRef(root);
+							// root.dependents ~= EntityRef(entity);
+							foreach (child; entity.children)
+								link(child);
+						}
+						foreach (child; entity.children)
+							link(child);
+					}
+					else
+					{
+						foreach (child; entity.children)
+							visit(child);
+					}
+				}
+
+				foreach (child; entity.children)
+					visit(child);
+			}
+	}
+
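An editorial illustration of the effect (not part of the patch):

	// Given `import std.stdio;`, every token of the dotted module name gets
	// the name's root entity added to its dependents, so removing "std" or
	// "stdio" in isolation would cascade into removing the whole name.
	// DustMite thus never reduces it to "import std;" or "import stdio;".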
 	static void postProcessDependency(ref Entity[] entities)
 	{
 		if (entities.length < 2)
@@ -1014,7 +1136,7 @@ struct DSplitter
 		{
 			if (parenKeywordTokens.canFind(entities[i].token))
 			{
-				auto pparen = firstHead(entities[i+1]);
+				auto pparen = firstNonEmpty(entities[i+1]);
 				if (pparen
 				 && *pparen !is entities[i+1]
 				 && pparen.token == tokenLookup!"(")
@@ -1086,6 +1208,7 @@ struct DSplitter
 				postProcessRecursive(e.children);
 
 		postProcessSimplify(entities);
+		postProcessImports(entities);
 		postProcessTemplates(entities);
 		postProcessDependency(entities);
 		postProcessBlockKeywords(entities);
@@ -1222,16 +1345,18 @@ struct DSplitter
 		postProcessArgs(entities);
 	}
 
-	static Entity* firstHead(ref return Entity e)
+	static Entity* firstNonEmpty(ref return Entity e)
 	{
 		if (e.head.length)
 			return &e;
 		foreach (ref c; e.children)
 		{
-			auto r = firstHead(c);
+			auto r = firstNonEmpty(c);
 			if (r)
 				return r;
 		}
+		if (e.tail.length)
+			return &e;
 		return null;
 	}
 
@@ -1265,6 +1390,7 @@ Entity[] parseSplit(alias fun)(string text)
 
 alias parseToWords = parseSplit!isNotAlphaNum;
 alias parseToLines = parseSplit!isNewline;
+alias parseToNull  = parseSplit!(c => c == '\0');
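A sketch of the new NUL split mode (selectable as `--split MASK:null`), assuming find -print0 style input; the result is grouped the same way as with the lines splitter, just with '\0' as the separator:

	auto parts = parseToNull("alpha\0beta\0");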
 
 /// Split s on end~start, preserving end and start on each chunk
 private string[] split2(string end, string start)(string s)
@@ -1295,9 +1421,45 @@ unittest
 	assert(split2!("]", "[")("[foo] [bar]") == ["[foo] [bar]"]);
 }
 
+// From ae.utils.array
+template skipWhile(alias pred)
+{
+	T[] skipWhile(T)(ref T[] source, bool orUntilEnd = false)
+	{
+		enum bool isSlice = is(typeof(pred(source[0..1])));
+		enum bool isElem  = is(typeof(pred(source[0]   )));
+		static assert(isSlice || isElem, "Can't skip " ~ T.stringof ~ " until " ~ pred.stringof);
+		static assert(isSlice != isElem, "Ambiguous types for skipWhile: " ~ T.stringof ~ " and " ~ pred.stringof);
+
+		foreach (i; 0 .. source.length)
+		{
+			bool match;
+			static if (isSlice)
+				match = pred(source[i .. $]);
+			else
+				match = pred(source[i]);
+			if (!match)
+			{
+				auto result = source[0..i];
+				source = source[i .. $];
+				return result;
+			}
+		}
+
+		if (orUntilEnd)
+		{
+			auto result = source;
+			source = null;
+			return result;
+		}
+		else
+			return null;
+	}
+}
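A hedged usage sketch of skipWhile (vendored above from ae.utils.array): it consumes the matching prefix from `source` in place and returns it:

	string s = "123abc";
	auto digits = s.skipWhile!(c => '0' <= c && c <= '9')(true);
	assert(digits == "123" && s == "abc");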
+
 Entity[] parseDiff(string s)
 {
-	return s
+	auto entities = s
 		.split2!("\n", "diff ")
 		.map!(
 			(string file)
@@ -1308,54 +1470,387 @@ Entity[] parseDiff(string s)
 		)
 		.array
 	;
+
+	// If a word occurs only in two or more (but not all) hunks,
+	// create dependency nodes which make Dustmite try reducing these
+	// hunks simultaneously.
+	{
+		auto allHunks = entities.map!(entity => entity.children).join;
+		auto hunkWords = allHunks
+			.map!(hunk => hunk.head)
+			.map!((text) {
+				bool[string] words;
+				while (text.length)
+				{
+					alias isWordChar = c => isAlphaNum(c) || c == '_';
+					text.skipWhile!(not!isWordChar)(true);
+					auto word = text.skipWhile!isWordChar(true);
+					if (word.length)
+						words[word] = true;
+				}
+				return words;
+			})
+			.array;
+
+		auto allWords = hunkWords
+			.map!(words => words.byPair)
+			.joiner
+			.assocArray;
+		string[bool[]] sets; // Deduplicated sets of hunks to try to remove at once
+		foreach (word; allWords.byKey)
+		{
+			immutable bool[] hunkHasWord = hunkWords.map!(c => !!(word in c)).array.assumeUnique;
+			auto numHunksWithWord = hunkHasWord.count!(b => b);
+			if (numHunksWithWord > 1 && numHunksWithWord < allHunks.length)
+				sets[hunkHasWord] = word;
+		}
+
+		foreach (set, word; sets)
+		{
+			auto e = new Entity();
+			debug e.comments ~= word;
+			e.dependents ~= allHunks.length.iota
+				.filter!(i => set[i])
+				.map!(i => EntityRef(allHunks[i]))
+				.array;
+			entities ~= e;
+		}
+	}
+
+	return entities;
+}
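An editorial illustration of the hunk-dependency pass above:

	// If the identifier "tmp" occurs in hunks #1 and #3 of a five-hunk diff
	// (in more than one hunk, but not in all of them), one extra contentless
	// Entity is appended whose dependents reference exactly those two hunks,
	// letting DustMite try removing both hunks in a single step.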
+
+size_t getIndent(string line, uint tabWidth, size_t lastIndent)
+{
+	size_t indent = 0;
+charLoop:
+	foreach (c; line)
+		switch (c)
+		{
+			case ' ':
+				indent++;
+				break;
+			case '\t':
+				indent += tabWidth;
+				break;
+			case '\r':
+			case '\n':
+				// Treat empty (whitespace-only) lines as belonging to the
+				// immediately higher (most-nested) block.
+				indent = lastIndent;
+				break charLoop;
+			default:
+				break charLoop;
+		}
+	return indent;
 }
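Hedged examples of getIndent behavior, assuming tabWidth = 4:

	assert(getIndent("    x", 4, 0) == 4); // four spaces
	assert(getIndent("\tx",   4, 0) == 4); // one tab counts as tabWidth
	assert(getIndent("\n",    4, 7) == 7); // blank lines keep the last indent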
 
 Entity[] parseIndent(string s, uint tabWidth)
 {
 	Entity[] root;
-	Entity[]*[] stack;
+	Entity[] stack;
 
 	foreach (line; s.split2!("\n", ""))
 	{
-		size_t indent = 0;
-	charLoop:
-		foreach (c; line)
-			switch (c)
-			{
-				case ' ':
-					indent++;
-					break;
-				case '\t':
-					indent += tabWidth;
-					break;
-				case '\r':
-				case '\n':
-					// Treat empty (whitespace-only) lines as belonging to the
-					// immediately higher (most-nested) block.
-					indent = stack.length;
-					break charLoop;
-				default:
-					break charLoop;
-			}
-
+		auto indent = getIndent(line, tabWidth, stack.length);
 		auto e = new Entity(line);
 		foreach_reverse (i; 0 .. min(indent, stack.length)) // non-inclusively up to indent
 			if (stack[i])
 			{
-				*stack[i] ~= e;
+				stack[i].children ~= e;
 				goto parentFound;
 			}
 		root ~= e;
 	parentFound:
 		stack.length = indent + 1;
-		stack[indent] = &e.children;
+		stack[indent] = new Entity;
+		e.children ~= stack[indent];
 	}
 
 	return root;
 }
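A small sketch of parseIndent: indentation depth becomes tree depth, with the sentinel child entity introduced above collecting the nested lines:

	auto nodes = parseIndent("a\n  b\n  c\n", 8);
	assert(nodes.length == 1); // "b" and "c" land in the subtree of "a"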
 
+Entity[] parseLisp(string s)
+{
+	// leaf head: token (non-whitespace)
+	// leaf tail: whitespace
+	// non-leaf head: "(" and any whitespace
+	// non-leaf tail: ")" and any whitespace
+
+	size_t i;
+
+	size_t last;
+	scope(success) assert(last == s.length, "Incomplete slice");
+	string slice(void delegate() advance)
+	{
+		assert(last == i, "Non-contiguous slices");
+		auto start = i;
+		advance();
+		last = i;
+		return s[start .. i];
+	}
+
+	/// How many characters did `advance` move forward by?
+	size_t countAdvance(void delegate() advance)
+	{
+		auto start = i;
+		advance();
+		return i - start;
+	}
+
+	void advanceWhitespace()
+	{
+		while (i < s.length)
+		{
+			switch (s[i])
+			{
+				case ' ':
+				case '\t':
+				case '\r':
+				case '\n':
+				case '\f':
+				case '\v':
+					i++;
+					continue;
+
+				case ';':
+					i++;
+					while (i < s.length && s[i] != '\n')
+						i++;
+					continue;
+
+				default:
+					return; // stop
+			}
+			assert(false); // unreachable
+		}
+	}
+
+	void advanceToken()
+	{
+		assert(countAdvance(&advanceWhitespace) == 0);
+		assert(i < s.length);
+
+		switch (s[i])
+		{
+			case '(':
+			case ')':
+			case '[':
+			case ']':
+				assert(false);
+			case '"':
+				i++;
+				while (i < s.length)
+				{
+					switch (s[i])
+					{
+						case '"':
+							i++;
+							return; // stop
+
+						case '\\':
+							i++;
+							if (i < s.length)
+								i++;
+							continue;
+
+						default:
+							i++;
+							continue;
+					}
+					assert(false); // unreachable
+				}
+				break;
+			default:
+				while (i < s.length)
+				{
+					switch (s[i])
+					{
+						case ' ':
+						case '\t':
+						case '\r':
+						case '\n':
+						case '\f':
+						case '\v':
+						case ';':
+
+						case '"':
+						case '(':
+						case ')':
+						case '[':
+						case ']':
+							return; // stop
+
+						case '\\':
+							i++;
+							if (i < s.length)
+								i++;
+							continue;
+
+						default:
+							i++;
+							continue;
+					}
+					assert(false); // unreachable
+				}
+				break;
+		}
+	}
+
+	void advanceParen(char paren)
+	{
+		assert(i < s.length && s[i] == paren);
+		i++;
+		advanceWhitespace();
+	}
+
+	Entity[] parse(bool topLevel)
+	{
+		Entity[] result;
+		if (topLevel) // Handle reading whitespace at top-level
+		{
+			auto ws = slice(&advanceWhitespace);
+			if (ws.length)
+				result ~= new Entity(ws);
+		}
+
+		Entity parseParen(char open, char close)
+		{
+			auto entity = new Entity(slice({ advanceParen(open); }));
+			entity.children = parse(false);
+			if (i < s.length)
+				entity.tail = slice({ advanceParen(close); });
+			return entity;
+		}
+
+		while (i < s.length)
+		{
+			switch (s[i])
+			{
+				case '(':
+					result ~= parseParen('(', ')');
+					continue;
+				case '[':
+					result ~= parseParen('[', ']');
+					continue;
+
+				case ')':
+				case ']':
+					if (!topLevel)
+						break;
+					result ~= new Entity(slice({ advanceParen(s[i]); }));
+					continue;
+
+				default:
+					result ~= new Entity(
+						slice(&advanceToken),
+						null,
+						slice(&advanceWhitespace),
+					);
+					continue;
+			}
+			break;
+		}
+		return result;
+	}
+
+	return parse(true);
+}
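A hedged example of the head/tail conventions described at the top of parseLisp:

	auto forms = parseLisp("(foo bar) ; comment\n");
	assert(forms.length == 1 && forms[0].head == "(");
	assert(forms[0].tail == ") ; comment\n"); // ")" plus whitespace and comment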
+
 private:
 
+Entity loadJson(string contents)
+{
+	import std.json : JSONValue, parseJSON;
+
+	auto jsonDoc = parseJSON(contents);
+	enforce(jsonDoc["version"].integer == 1, "Unknown JSON version");
+
+	// Pass 1: calculate the total size of all data.
+	// --no-remove and some optimizations require that entity strings
+	// are arranged in contiguous memory.
+	size_t totalSize;
+	void scanSize(ref JSONValue v)
+	{
+		if (auto p = "head" in v.object)
+			totalSize += p.str.length;
+		if (auto p = "children" in v.object)
+			p.array.each!scanSize();
+		if (auto p = "tail" in v.object)
+			totalSize += p.str.length;
+	}
+	scanSize(jsonDoc["root"]);
+
+	auto buf = new char[totalSize];
+	size_t pos = 0;
+
+	Entity[string] labeledEntities;
+	JSONValue[][Entity] entityDependents;
+
+	// Pass 2: Create the entity tree
+	Entity parse(ref JSONValue v)
+	{
+		auto e = new Entity;
+
+		if (auto p = "filename" in v.object)
+		{
+			e.file = new Entity.FileProperties;
+			e.file.name = p.str.buildNormalizedPath;
+			enforce(e.file.name.length &&
+				!e.file.name.isAbsolute &&
+				!e.file.name.pathSplitter.canFind(`..`),
+				"Invalid filename in JSON file: " ~ p.str);
+		}
+
+		if (auto p = "head" in v.object)
+		{
+			auto end = pos + p.str.length;
+			buf[pos .. end] = p.str;
+			e.head = buf[pos .. end].assumeUnique;
+			pos = end;
+		}
+		if (auto p = "children" in v.object)
+			e.children = p.array.map!parse.array;
+		if (auto p = "tail" in v.object)
+		{
+			auto end = pos + p.str.length;
+			buf[pos .. end] = p.str;
+			e.tail = buf[pos .. end].assumeUnique;
+			pos = end;
+		}
+
+		if (auto p = "noRemove" in v.object)
+			e.noRemove = (){
+				if (*p == JSONValue(true)) return true;
+				if (*p == JSONValue(false)) return false;
+				throw new Exception("noRemove is not a boolean");
+			}();
+
+		if (auto p = "label" in v.object)
+		{
+			enforce(p.str !in labeledEntities, "Duplicate label in JSON file: " ~ p.str);
+			labeledEntities[p.str] = e;
+		}
+		if (auto p = "dependents" in v.object)
+			entityDependents[e] = p.array;
+
+		return e;
+	}
+	auto root = parse(jsonDoc["root"]);
+
+	// Pass 3: Resolve dependents
+	foreach (e, dependents; entityDependents)
+		e.dependents = dependents
+			.map!((ref d) => labeledEntities
+				.get(d.str, null)
+				.enforce("Unknown label in dependents: " ~ d.str)
+				.EntityRef
+			)
+			.array;
+
+	return root;
+}
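A round-trip sketch: loadJson accepts what dumpToJson in dustmite.d produces, e.g. this minimal document:

	auto root = loadJson(`{"version":1,"root":{"head":"int x;"}}`);
	assert(root.head == "int x;");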
+
 bool isNewline(char c) { return c == '\r' || c == '\n'; }
 alias isNotAlphaNum = not!isAlphaNum;