-
Notifications
You must be signed in to change notification settings - Fork 0
/
sort.c
3412 lines (3080 loc) · 124 KB
/
sort.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**************************************************************************
Program: sort.c
Purpose: sort lines of text (with all kinds of options).
Copyright: 1989 Free Software Foundation
Created: December 1988.
Authors: Mike Haertel, Graham Toal.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 1, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 675
Mass Ave, Cambridge, MA 02139, USA.
The author may be reached (Email) at the address mike@ai.mit.edu, or (US
mail) as Mike Haertel c/o Free Software Foundation.
*** This source was originally released by Mike Haertel for the GNUish MSDOS
project. He pointed out at the time that it was very much under development.
This version has been hacked about a lot by Graham Toal. If you want the
real GNU sort, please contact Mike Haertel. This file is currently under
development -- i.e. it does not do anything useful. Please do not pass
this around in the guise of an official FSF product. However under the
terms of the GNU Licence I am making what work I have done on this available
to anyone who may find it useful. ***
I have marked the clever bits of code, which are Mike Haertels, with his
initials, just so readers won't blame him for the grotty bits, which are
mine :-) [Or at least any procedures where the code is mostly his and
not hacked about too badly by me -- this seemed easier than putting ownership
comments on every single line :-)]
Graham Toal may be reached at gtoal@edinburgh.ac.uk (via nsfnet-relay.ac.uk
or ukc.ac.uk), or if all routes have failed, at gtoal@vangogh.berkeley.edu
*===*===*===*
***************************************************************************/
/*
* Working notes: Unix sort already uses options y & z -- remove ours.
* Implement unix -y<kmem> option: tells how much phys mem to use. -y0 means
* minim, -y means max(*). This is useful on VM systems as we don't want to
* stray outside our working set. On physical memory systems, we probably
* want to take as much as possible. Unless it is a multitasking phys mem
* system I suppose :-( Also -z<recsz> for longest line length. Not that we
* can do anything about -- we're allowing the longest lines we can anyway...
* What is also sub-optimal is that if sorting > 1 file, we sort the files
* individually & merge the results. Would be better -- if all the files fit
* in store -- to either merge beforehand, _then_ sort, or sort but keep in
* store for the merge rather than write back to a temp file. (*) I think the
* mach kernel has hooks for finding out the current working-set size. If
* that feature isn't present, assume max vm size = max phys size :-(
* Remember to add -u option! (Note optimisation caveats in sort.h...)
*/
/**************************************************************************
We used GNU's sort clone as a central resource in a database indexing
project; Although there were many problems with GNUsort which made life
difficult for us, its core sorting algorithm and features were superb.
To solve the problems we had with GNUsort, we first wrote a sort of our
own. Our sort was complementary to GNU's sort; we fixed the problems
they had and ignored the bits they did well -- with the intention of
eventually marrying the two programs and producing an offspring which did
*everything* well! (Specifically, we kept GNU's 'line' structure so that
someday we could adopt their clever key stuff.)
(premature comment below for when I've finished hacking :-) )
This we have now done. This program is a full-featured sort based on GNUsort,
which we hereby offer to the FSF. [[At the moment, it is only partially
complete, and still awaits handling of large (> avail mem) files. However
the algorithms for this have already been implemented in the first draft,
so shouldn't take long to bolt on.]]
Here are the three major problems with GNUsort which we've solved (any others
are minor and incidental):
1) Its use of malloc is *very* dodgy. Not only does it fragment the heap,
but it make assumptions about how much memory it is going to need which
aren't valid. It frequently runs out of heap space when sorting large
files, especially when they are mostly composed of short lines.
[We don't do bitty little mallocs, and we always have enough space]
(It uses too much ram for things other than text -- we had trouble
sorting 4Mb files on a 16Mb system. The actual sort itself which
needs an extra area the size of the line pointer array is the
biggest culprit)
On a virtual memory system, of course, the fragmentation etc probably
won't be noticed -- but performance will degrade badly as soon as the
memory in use exceeds the working set size.
[We know exactly how much memory we are using (necessary for a
physical memory system) and we never exceed it.]
2) It makes heavy use of temporary files when merging. This causes two
problems; Firstly, on a big file, it can generate hundreds of temporary
files. On our filing system this is disastrous -- we can only have 76
files in a directory; Secondly, it has many files open at once. Our C
runtime library has a ridiculously small limit on the number of open files
allowed.
[We use *one* temporary file, at the expense of sometimes sorting the
input file in place (ie destructively).]
3) It needs twice the size of the input file to be free on the disk. (One
size-of-input for the temporary files, and one for the output file.)
[We offer a 'sort-in-place' option. In fact, sort-in-place is the
primitive default from which the sort-to-output can be manufactured]
What follows is a brief description of our file & store management policy.
<----------------------- start of physical memory (low addresses)
[chunk desc #1]
[chunk desc #2] | These grow this way. One is added for each
[chunk desc #3] | new chunk and all are preserved over chunks.
[chunk desc ...] |
[chunk desc #n] \ /
v
Space from here down is reused for each chunk.
[line from file #1\n]
[line from file #2\n] | Lines of text are added in natural order.
[line from file ...\n] | (newlines are converted to NULs)
[line from file #n\n] \ /
v
[ ... small unused gap ...]
When the lines and descriptors meet, the chunk is ready to be sorted.
[line desc #n]
[line desc #n-1] /^\ Fixed-length descriptors (one per line)
[line desc #3] | are added from the bottom of memory.
[line desc #2] | It is these which are actually sorted.
[line desc #1] |
<----------------------- end of physical memory (high addresses)
We sort large files in chunks -- chunks and associated control info are
defined as being however much we can fit into our large single-malloc store
buffer. Each chunk is sorted in store and the sorted section written out to
file. We keep a record of where all the chunks are in the file in our
growing array of chunk descriptors. (See above).
[It is axiomatic that sorts go faster when the data is entirely in physical
storage; this is a situation where virtual memory doesn't help. This sort
program gropes with a binary chopping malloc to get all physical memory on a
physical memory machine; on a VM machine it must be *told* how much store to
use, and that size must fit in the active working set of pages.]
When building a chunk for sorting, we read in lines sequentially. Each line
is added to our big buffer in increasing order, and a record describing the
line is added to the end of the buffer in decreasing order. We stop adding
text just before the two areas cross. To keep the code simple, we backtrack
the last partial line to the start of that line, so chunks always consist of
whole lines. [[Not quite true; rather than read line-sized units at a time,
we assume fread() is optimised for big chunks, and get as much as will
possibly fit in the buffer. If it is a big file, clearly the text at the
end will have to be thrown away as it is overwritten by line pointers]]
(If a single line is too big for our (gigantic) buffer, we put it in a chunk
of its own and deem it already sorted. (we point a chunk descriptor at it
and skip along the input file until we eventually hit the newline). This
passes the buck to the merge procedure to correctly merge this chunk with
the others. Unfortunately, our merge procedure doesn't cope with it at the
moment either! :-( )
We sort using qsort on the array of line descriptors as a normal sort would.
The comparison routine passed to qsort allows us to sort using all the fancy
options of GNUsort. (We don't actually do so yet as it happens, but we use
the same 'line' format as GNUsort so such options should be easy to add
later).
We write this sorted chunk out to file and continue this process until the
whole file has been sorted. Since our ram area is empty between chunks, we
can easily add another chunk descriptor to our array.
Incidentally, as sorting optimisations we do the following:
1) If the lines were already sorted (actually, the line descriptors being
in reverse order!) we don't need to sort. And since we are in fact doing a
destructive sort of the input file in place, we don't even need to write
the data back to file.
2) If the lines were already sorted in reverse (ie the line descriptors
were sorted forwards), we simply reverse all the line descriptors in situ.
3) Otherwise, we need to sort. Some systems have a sort routine which
can degrade badly on partially sorted input, so we use the standard
extremely fast card-shuffling algorithm to randomise the order of the
line descriptors. This is *not* a significant overhead, and guarantees
typical-case performance from any sort, rather than risking a bad
worst-case on some systems.
Having sorted the line descriptors, we loop over them, writing the lines
they point to back out to file. (We could actually have looped over them
backwards in the case of the already-reverse-sorted chunk, but the overhead
of reversing the actual records was pretty low anyway).
Now having a file which is sorted internally in large chunks, we must merge
these chunks to the output file. (Of course, if we have only one chunk
left, our job is done. In fact, we do a quick scan down our list of chunks,
and if we discover that the last line of a chunk is ordered before the first
line of the adjacent chunk, then we can merge those two chunks into a larger
whole. In this way, a large almost-sorted file can collapse into a single
block and no more work need be done...)
A few moments thought will show that it is impossible (or at least *very*
difficult) to merge chunks of a file back to that file in situ. However it
is only when we have sorted a file and ended up with disordered chunks that
we need this extra space. So, we merge the chunks of the file to another
output file, delete the input file, and rename the output file back to the
input. (If we were sorting from an input file to an output file, we can
skip that bit).
The model of merging that we use is slightly warped. We pretend that each
chunk in the file is a file in its own right, and we pretend that we are
feeding each of these files into a 'merge' program which takes two files as
input and produces one file as output; furthermore, we pretend that we have
a binary tree of these 'merge' processes feeding one into the other via unix
pipes!
(something like this... [building this balanced tree without recursion or
mallocs was fun :-)])
<---- file 1
<---- merge |
| <---- file 2
|
|
<---- merge |
| <---- file 3
| |
| |
<---- merge |
| <---- file 4
| |
<---- merge |
|
<---- file 5
In fact, we do nothing of the sort. What we actually do is construct a
lazy-evaluation tree of procedure invocations, whereby we get the next line
to be output by calling the root procedure node and asking for a line. This
request is passed down a calling hierarchy and eventually gets data from a
leaf node out of a chunk of the file. To avoid dreadful thrashing problems,
each of the <N> leaf nodes has been given 1/N of our original big buffer (no
longer needed for sorting...). This almost entirely removes any random disk
seeking overhead. Again, the pipe analogy is not quite valid; what we
actually pass up the tree are pointers to the lines in the cached buffers.
The messages passed down the tree are therefore not "give me a line" but
"make sure the leaf node has a line in store, and pass me a pointer to that
leaf node's line descriptor". The line itself is consumed at the highest
level by directly tweaking the leaf's line descriptor. I've a feeling there's
a silly overhead here somewhere and that I should run back up the tree from
the leaf, invalidating those branches, so that I don't need to do a whole
tree compare of all branches to make it valid again. Must think about it
more when I've sobered up %-}
By this stage, our memory map has changed a little, but we are still using
the technique of allocating from a large fixed buffer.
<----------------------- start of physical memory (low addresses)
[chunk desc #1]
[chunk desc #2] | Chunks were all allocated by end of sort phase.
[chunk desc #3] |
[chunk desc ...] |
[chunk desc #n] -
[pipe #1]
[pipe #2] | These grow this way. A tree of pipes is
[pipe #3] | built up. This array is finished by the
[pipe ...] | time we start merging.
[pipe #n] \ /
v
-
Space from here down is equally divided among all chunks for buffering.
[buffer for chunk #1\n]
[buffer for chunk #2\n]
[buffer for chunk #3\n]
[buffer for chunk ...\n]
[buffer for chunk #n-2\n]
[buffer for chunk #n-1\n]
[buffer for chunk #n\n]
[A little left-over space at the end, as all chunk buffers were created
of exactly the same size. No real reason for this, just happened that way]
<----------------------- end of physical memory (high addresses)
We don't need arrays of line descriptors; we
find each line when we need it, and the
details of the current line are held in
the chunk's buffer.
Limitations: Like GNUsort [& AT&T's sort, it transpires...], we're limited to
sorting files whose longest lines fit in store. If we could hack a comparison
function which worked directly on file (or were using mmap'ed input files) this
wouldn't be a problem. The longest line during merging is sadly much smaller
than the longest line during sorting. Whether this will be a problem remains
to be seen. It may be that we will fail on the merge stage on certain files.
AHA! One of the problems I'd had with this so far was that of losing text
off the end of the buffer when building up the array of line pointers. This
forced me to make the design decision that you could only sort from a file,
not a sequential device (like stdin). The main difficulty I had was where
to store the leftover text while sorting, when waiting for the next run.
I've just realised that the thing to do is to shuffle the text buffer round
so that the leftover text is at the start of the buffer rather than the end --
so you can append to it in the next run. [on reflection, maybe not. Later, dudes.]
I've always wanted a chance to use this rotation algorithm :-) ...
Assume the numbers are the leftovers...
abcdefghijklmnopqrstuvwyz0123456789
First, reverse all the characters: \____________ swap _______________/
9876543210zywvutsrqponmlkjihgfedcba
Then reverse the two sections: \_ swap _/\________ swap _________/
Giving: 0123456789abcdefghijklmnopqrstuvwyz
(There may be other ways to do this, but I like this one :-) )
**************************************************************************/
/*
* Algorithmically, this sort relies on having large contiguous areas of
* memory. If you are compiling on a DOS box, make sure you use what is
* called 'huge model'. Otherwise most of the sorting performance will be
* lost through having merge() do the work which sort() should be doing...
* Though I'm afraid to admit I have used a few practices which might make it
* harder to port to DOS. Sorry folks.
*/
static char version[] = "GNew sort, version 0.1.3";
/* Release.version.edit */
#include <stdio.h>
#include <errno.h>
#include <ctype.h>
#include <string.h>
#include <stddef.h>
#include <limits.h>
#include <stdlib.h>
#include <stdarg.h>
#include <time.h>
#ifdef MEMDEBUG
/*
* Try to explicitly free all memory on exit if you can -- this will help
* mnemosyne find your memory leaks.
*/
#include <mnemosyn.h> /* My hack of Marcus J Ranum's *truly
* excellent* debugging package */
#else
#define CHECKHEAP()
#endif
#ifdef FILEDEBUG
#include "filebug.h" /* for now - later to be <filebug.h> */
#endif
#include "sort.h"
/* Error/debug handling procedures */
/* Non-zero => sperror() prefixes messages with the source file and line. */
int sperror_debug = (0 != 0);
char *sperror_debuglog = ""; /* "" signals 'look up DEBUGLOG
* variable'. May be set to NULL to
* save log space */
#ifdef __riscos
#pragma -v1 /* hint to the compiler to check f/s/printf
* format */
#endif
/*
 * Emit a timestamped debug message to stderr and append the untimestamped
 * text to the debug log file. Unlike sperror(), no '\n' is implied -- the
 * caller supplies its own newline in fmt. sperror_debuglog == NULL
 * suppresses the log file (stderr output is kept); "" means look up the
 * DEBUGLOG environment variable for the log file name.
 */
static void
debugf(const char *fmt,...)
{                               /* Debugf does NOT imply a \n on the end of
                                 * its arg */
    static char outbuf[256];
    static char tims[32];
    FILE *logfile;
    va_list arg_ptr;
    time_t tim;

    va_start(arg_ptr, fmt);
    /* BUG FIX: bound the expansion -- vsprintf could overrun outbuf. */
    vsnprintf(outbuf, sizeof outbuf, fmt, arg_ptr);
    va_end(arg_ptr);
    time(&tim);
    sprintf(tims, "%s", ctime(&tim));
    tims[strlen(tims) - 1] = '\0';      /* strip ctime()'s trailing newline */
    fprintf(stderr, "%s %s", tims, outbuf);
    /*
     * BUG FIX: the old code returned before doing ANY output whenever
     * sperror_debuglog was NULL, which suppressed the stderr message too
     * (inconsistent with sperror()) and left the check below as
     * unreachable dead code. Now only the log file is suppressed.
     */
    if (sperror_debuglog == NULL)
        return;                 /* Logging suppressed by programmer */
    if (*sperror_debuglog == '\0')
        sperror_debuglog = getenv("DEBUGLOG");
#ifdef __riscos
    if (sperror_debuglog == NULL)
        sperror_debuglog = "scsi:$.informat.dx2.sperrorlog";
#else
    if (sperror_debuglog == NULL)
        sperror_debuglog = "sperror.log";
#endif
    logfile = fopen(sperror_debuglog, "a");
    if (logfile != NULL) {
        fprintf(logfile, "%s", outbuf);
        fclose(logfile);
    }
}
#ifdef __riscos
#pragma -v1 /* hint to the compiler to check f/s/printf
* format */
#endif
/*
 * Report an error message to stderr and append it to the debug log file.
 * A '\n' is appended to the formatted message. When sperror_debug is set
 * the message is prefixed with the source position (pass __LINE__ and
 * __FILE__). sperror_debuglog == NULL suppresses the log file; "" means
 * look up the DEBUGLOG environment variable.
 */
static void
sperror(int line, char *file, const char *fmt,...)
{                               /* sperror implies a \n after its arg */
    static char outbuf[256], s[256];
    FILE *logfile;
    va_list arg_ptr;

    va_start(arg_ptr, fmt);
    /* BUG FIX: bound the expansions -- vsprintf/sprintf could overrun. */
    vsnprintf(outbuf, sizeof outbuf, fmt, arg_ptr);
    va_end(arg_ptr);
    if (sperror_debug) {
        snprintf(s, sizeof s, "%s, line %d: %s\n", file, line, outbuf);
    } else {
        snprintf(s, sizeof s, "%s\n", outbuf);
    }
    fprintf(stderr, "%s", s);
    if (sperror_debuglog == NULL)
        return;                 /* Logging suppressed by programmer */
    if (*sperror_debuglog == '\0')
        sperror_debuglog = getenv("DEBUGLOG");
    /*
     * BUG FIX: the fallback log name was the RISC OS path unconditionally
     * here, while debugf() guards it with #ifdef __riscos. Made the two
     * routines consistent so non-RISC OS builds log to "sperror.log".
     */
#ifdef __riscos
    if (sperror_debuglog == NULL)
        sperror_debuglog = "scsi:$.informat.dx2.sperrorlog";
#else
    if (sperror_debuglog == NULL)
        sperror_debuglog = "sperror.log";
#endif
    logfile = fopen(sperror_debuglog, "a");
    if (logfile != NULL) {
        fprintf(logfile, "%s", s);
        fclose(logfile);
    }
}
/*
 * Build the character classification tables used by the comparison
 * routines: blanks, digits, nonprinting, nondictionary, and the
 * case-folding map.
 */
static void
inittables(void)
{
    int c;

    /*
     * Note - this could benefit from judicious use of setlocale() --
     * although it is reasonably 8-bit-clean, the tolower() etc calls below
     * unfortunately impose 7-bit character sets by default. (Best of all
     * would be if the locale could be passed in as an optional parameter,
     * or via an environmental variable.)
     */
    for (c = 0; c <= UCHAR_MAX; c++) {
        if (isspace(c))
            blanks[c] = 1;
        if (isdigit(c))
            digits[c] = 1;
        if (!isprint(c))
            nonprinting[c] = 1;
        if (!isalnum(c) && !isspace(c))
            nondictionary[c] = 1;
        /* Fold upper case onto lower; everything else maps to itself. */
        fold_tolower[c] = isupper(c) ? tolower(c) : c;
    }
}
/*
* Useful little (!?) debug utility to be inserted at dubious points in the
* code when mucking about with anything which changes the shape of the large
* buffer.
*/
static void
display_mem_usage(char *where)
{
/* 'where' labels the dump with the call site's description. */
/* NOTE(review): x appears to be the global sort workspace declared in
 * sort.h -- confirm; this routine only reads it. */
int i;
/*********************************************************************
sort phase:
[pipechunk 0]
[pipechunk ...]
[pipechunk n-1]
[text ]
[ .free. ]
[line n-1]
[line ...]
[line 0]
merge phase:
[pipechunk 0]
[pipechunk ...]
[pipechunk n-1]
[pipe 0]
[pipe ...]
[pipe m-1]
[text 0]
[text ...]
[text n-1]
[ .free. ]
*********************************************************************/
/* Raw pointer values of the buffer landmarks, low to high address. */
debugf("---------------------------------------- %s\n", where);
debugf("&x->pipe[x->next_free_pipe = %d] = %p\n",
x->next_free_pipe, &x->pipe[x->next_free_pipe]);
debugf("x->cur_chunk->text = %p\n", x->cur_chunk->text);
debugf("x->cur_chunk->text_accepted = %p\n", x->cur_chunk->text_accepted);
debugf("x->cur_chunk->text_lim = %p\n", x->cur_chunk->text_lim);
debugf("x->buffer_lim = %p\n", x->buffer_lim);
debugf("&x->line[x->next_used_line = %d] = %p\n", /* Add limit of line's
* text? */
x->next_used_line, &x->line[x->next_used_line]);
debugf("&x->line[x->line_base = %d] = %p\n",
x->line_base, &x->line[x->line_base]);
/* Pipe array: one entry per pipe allocated so far. */
if (x->next_free_pipe == 0) {
debugf("%p [--no pipes--]\n", &x->pipe[0]);
} else {
for (i = 0; i < x->next_free_pipe; i++) {
debugf("%p [pipe %d] %p\n", &x->pipe[i], i, &x->pipe[i + 1]);
}
}
/* Text region: accepted lines first, then raw (not yet line-ified) text.
 * NOTE(review): pointer differences are printed with %d -- on LP64
 * systems ptrdiff_t is wider than int; harmless for debug output but
 * worth confirming. */
debugf("%p [text lines... (%d bytes) -> ] %p\n",
x->cur_chunk->text, x->cur_chunk->text_accepted - x->cur_chunk->text, x->cur_chunk->text_accepted);
debugf("%p [raw text... (%d bytes) -> ] %p\n",
x->cur_chunk->text_accepted, x->cur_chunk->text_lim - x->cur_chunk->text_accepted, x->cur_chunk->text_lim);
/* Gap between the text region and the line-descriptor array. */
if ((x->buffer_lim - x->cur_chunk->text_lim) < 1024) {
debugf("%p [--small gap--] %p\n", x->cur_chunk->text_lim, x->buffer_lim);
} else {
debugf("%p [--large gap--] %p\n", x->cur_chunk->text_lim, x->buffer_lim);
}
/* Line descriptors grow downward from line_base toward next_used_line. */
if (x->next_used_line == x->line_base) {
debugf(" [--no lines--] %p\n", &x->line[x->line_base]);
} else {
if (x->line_base - x->next_used_line > 1) {
/* First line */
debugf("%p [line 0 (%d) -> %p \"%s\"]\n",
&x->line[x->next_used_line], x->next_used_line,
x->line[x->next_used_line + 1].text,
x->line[x->next_used_line + 1].text);
}
/* Last or only line */
debugf("%p [line %d (%d) -> %p \"%s\"] %p\n",
&x->line[x->line_base - 1],
x->line_base - 1 - x->next_used_line, x->line_base - 1,
x->line[x->line_base - 1].text, x->line[x->line_base - 1].text,
&x->line[x->line_base]);
}
debugf("%p [--end of memory--]\n", &x->line[x->line_base]);
}
/*
 * Hand out the next PIPE slot from the workspace array, zeroed. This is
 * not a general-purpose allocator: callers rely on successive calls
 * returning consecutive array elements.
 */
static PIPE *
new_pipe(void)
{
    PIPE *fresh;

    fresh = &x->pipe[x->next_free_pipe];
    x->next_free_pipe += 1;
    memset(fresh, '\0', sizeof *fresh);
    return fresh;
}
/*
 * Allocate a chunk descriptor -- a leaf (DATA) node of the pipe tree.
 * Only called when the big buffer is empty.
 */
PIPE *
make_pipechunk(void)
{
    PIPE *leaf = new_pipe();

    leaf->type = DATA;
    return leaf;
}
/*
 * Allocate a pipe operator descriptor -- an interior (MERGE) node of the
 * pipe tree. Only called after all chunks have been sorted and before
 * merging.
 */
PIPE *
make_pipeoper(void)
{
    PIPE *oper = new_pipe();

    oper->type = MERGE;
    return oper;
}
/*
 * Wire two sub-pipes in as the operands of merge node p and return p.
 */
static PIPE *
merge_pipes(PIPE * p, PIPE * left, PIPE * right)
{
    p->node.operand.right = right;
    p->node.operand.left = left;
    return p;
}
/*
 * Dump the full state of one chunk descriptor to the debug log; 'why'
 * labels the dump with the caller's reason.
 */
static void
display_chunk(CHUNK * c, char *why)
{
    const char *fname;

    fname = (c->stored_chunk_filename == NULL) ? "(null)"
        : c->stored_chunk_filename;
    debugf("chunk @ %p: --- %s\n", c, why);
    debugf("c->buf = %p\n", c->buf);
    debugf("c->buf_lim = %p\n", c->buf_lim);
    /* Dunno whether to delve into this */
    debugf("c->line = %p\n", &c->line);
    debugf("c->stored_chunk_filename = %s\n", fname);
    debugf("c->stored_chunk_file = %p\n", c->stored_chunk_file);
    debugf("c->file_start_pos = %ld\n", c->file_start_pos);
    debugf("c->file_chunk_length = %ld\n", c->file_chunk_length);
    debugf("c->text = %p\n", c->text);
    debugf("c->text_accepted = %p\n", c->text_accepted);
    debugf("c->text_lim = %p\n", c->text_lim);
}
/*
 * Refill a chunk's cache buffer from its backing file. Any unconsumed
 * text at chunk->text is first slid back to the start of the buffer
 * (chunk->buf), then as many bytes as will fit -- or as remain of the
 * chunk in the file -- are read in after it. The chunk's file window
 * (file_start_pos / file_chunk_length) is advanced past what was read.
 * The caller re-finds the current line: chunk->line is deliberately not
 * relocated (see comment below).
 */
static void
replenish_chunk(CHUNK * chunk)
{
    size_t bytes_got, valid_length, relocation_offset, free_space;

    if (chunk->file_chunk_length == 0) {
        /* Nothing of this chunk is left in the file. */
        return;
    }
    /*
     * (this shouldn't be called with a valid line up the spout, but we'll
     * assume it can since we might want to call it in this way at some
     * point in the future.)
     */
    /* Move any text at chunk->text back up to chunk->buf. */
    valid_length = chunk->text_lim - chunk->text;
    relocation_offset = chunk->text - chunk->buf;
    if ((valid_length != 0) && (relocation_offset != 0)) {
        /* Regions may overlap, so memmove rather than memcpy. */
        memmove(chunk->buf, chunk->text, valid_length);
    }
    /*
     * Relocating chunk->line is so tedious that it's easier to throw it
     * away and always find the line again.
     */
    chunk->text_lim -= relocation_offset;
    /* Read as much as possible, from text_lim up to buf_lim. */
    free_space = chunk->buf_lim - chunk->text_lim;
    if (free_space == 0) {
        return;                 /* buffer is full already */
    }
    if (chunk->file_chunk_length < free_space) {
        /* Rest of the chunk will fit. */
        free_space = (size_t) chunk->file_chunk_length;
    }
    if (sort_verbose)
        debugf("getchunk:\n");
    /* BUG FIX: the fseek result was previously ignored. */
    if (fseek(chunk->stored_chunk_file, chunk->file_start_pos, SEEK_SET) != 0) {
        sperror(__LINE__, __FILE__, "sort: failed seeking to chunk in %s",
                chunk->stored_chunk_filename);
    }
    bytes_got = fread(chunk->text_lim, 1, free_space, chunk->stored_chunk_file);
    if (sort_verbose)
        debugf("getchunk: %lu bytes\n", (unsigned long) bytes_got);
    /*
     * BUG FIX: fread() returns a size_t and never -1, so the old test
     * (bytes_got == -1) could never fire. A short count with the stream's
     * error indicator set is how stdio reports a read failure.
     */
    if (bytes_got < free_space && ferror(chunk->stored_chunk_file)) {
        sperror(__LINE__, __FILE__, "sort: failed on reading back chunk from %s",
                chunk->stored_chunk_filename);
    }
    chunk->text_lim += bytes_got;
    /* Advance the file window past what we just read. */
    chunk->file_start_pos += bytes_got;
    chunk->file_chunk_length -= bytes_got;
    /* Caller will find first line if chunk->text_accepted isn't valid. */
    chunk->text = chunk->text_accepted = chunk->buf;
}
/*
 * Divide the big buffer (the space between the pipe array and the line
 * array) equally among the first next_free_chunk chunks, point each
 * chunk's cache buffer at its share, and preload each from its backing
 * file via replenish_chunk(). Initialises the chunks' internal state
 * (where to seek for more data, how many bytes remain).
 */
static void
preload_chunk_cache(int next_free_chunk)
{
    unsigned char *free_ptr;
    size_t available_space;
    int i;

    /*
     * BUG FIX: the old code divided the total space by next_free_chunk
     * twice -- once into available_space and again into an unused local
     * 'block_size'. The dead variable and its bogus second division are
     * gone; each chunk still receives total/next_free_chunk bytes, as the
     * allocation loop below always used available_space.
     */
    available_space = ((char *) &x->line[x->next_used_line]
                       - (char *) &x->pipe[x->next_free_pipe]) / next_free_chunk;
    free_ptr = (unsigned char *) &x->pipe[x->next_free_pipe];
    for (i = 0; i < next_free_chunk; i++) {
        x->pipe[i].node.chunk.chunk.buf = free_ptr;
        x->pipe[i].node.chunk.chunk.text = free_ptr;
        x->pipe[i].node.chunk.chunk.text_accepted = free_ptr;
        x->pipe[i].node.chunk.chunk.text_lim = free_ptr;
        /*
         * NOTE: if the size of the chunk is less than the space for this
         * buffer, we could allocate to it only as much space as is
         * needed. Better: allocate the chunks in reverse, so the last
         * (smallest) comes first; bump free_ptr by
         * min(actual_size, tot_space/slots_left) and redo the bucket
         * calculation anew for each chunk, so later slots can grow,
         * using the leftover space. The goal is to fit the whole file in
         * ram if the file size is less than the buffer size, even if
         * only by a few bytes.
         */
        x->pipe[i].node.chunk.chunk.buf_lim = (free_ptr += available_space);
        replenish_chunk(&x->pipe[i].node.chunk.chunk);
    }
}
/* Build a balanced tree bottom-up from all the chunks of the file. */
/*
 * On entry, pipe[0 .. x->next_free_pipe-1] are the leaf (DATA) pipes, one
 * per chunk, already initialised by preload_chunk_cache().  This routine
 * repeatedly pairs adjacent pipes into MERGE pipes (appended to the pipe
 * array, presumably by make_pipeoper() bumping x->next_free_pipe -- the
 * allocator is not visible here) until a single root pipe remains, and
 * returns a pointer to that root.
 */
static PIPE *
merge_chunks(void)
{
int first_pipe, this_pipe, last_pipe;
/*
 * Note: one optimisation I've been thinking about but haven't done yet
 * -- I should check the consecutive chunks for ordering -- if the last
 * line of one chunk is ordered before the first line of the next chunk,
 * then the two chunks can be concatenated back into one chunk. Assuming
 * of course that they are stored sequentially in the same chunk file...
 */
/*
 * One of the problems with GNU sort was that it tried to open as many
 * files as possible at once. This is a very difficult thing to do, as
 * the number isn't knowable at compile time. In my first hack with GNU
 * sort, I made that parameter dynamic, by opening lots of files till one
 * failed, deleting them all, and reducing the number I found by a few
 * for emergencies. However, this sort attempts to keep the number of
 * open files down to a minimum. Most of the chunks to be sorted will be
 * held within the one chunk-file anyway, but even if there are chunks in
 * multiple files to be merged, I'd rather do it by opening the file
 * every time I need a bufferful, and closing it immediately afterwards.
 * I'm assuming all chunk files are seekable. (Which they will be). The
 * chunks will be large enough (many 10's of Ks; pref many 100's of Ks or
 * even Megs) that the overhead of fopen+fseek+fclose will be minimal
 * relative to reading the data. [We took this approach in a database
 * project recently -- worked a treat. In that project we built a fake
 * stdio layer which did all the opening & closing transparently, using
 * an LRU to cache <n> open files out of <m> virtually open files]
 */
/* [first_pipe, last_pipe) is the current level of the tree being built. */
last_pipe = x->next_free_pipe;
this_pipe = first_pipe = 0;
/*
 * Then build the pipes into trees. Sets last_pipe to last of root chunk
 * pipes
 */
for (;;) {
/* Built a tree with all pipes merging into one...? */
if (first_pipe + 1 == last_pipe) {
/*
 * debugf("We have the final pipe! first_pipe = %d last_pipe =
 * %d\n", first_pipe, last_pipe);
 */
/* Exactly one pipe left in the level: it is the root. */
break;
}
/* Inner loop: merge adjacent pairs of the current level. */
for (;;) {
PIPE *p;
/*
 * debugf("this_pipe = %d last_pipe = %d\n", this_pipe,
 * last_pipe);
 */
if (this_pipe + 1 == last_pipe) {
/*
 * debugf("odd number of pipes -- %d gets a bye\n",
 * this_pipe);
 */
/* One pipe left over -- it advances unmerged ("a bye"). */
break;
} else if (this_pipe + 1 > last_pipe) {
/* Even count: exactly consumed the whole level. */
break;
}
/*
 * debugf("Merging pipes %d and %d -> %d\n", this_pipe, this_pipe
 * + 1, x->next_free_pipe);
 */
/* New MERGE pipe feeding from the pair; appended after last_pipe. */
p = make_pipeoper();
merge_pipes(p, &x->pipe[this_pipe], &x->pipe[this_pipe + 1]);
this_pipe += 2;
}
/*
 * NOTE(review): when the level had an odd count, last_pipe is pulled
 * back to this_pipe so the bye'd pipe (at index this_pipe) becomes
 * part of the next level via "first_pipe = last_pipe" below --
 * presumably make_pipeoper() appended the new pipes immediately
 * after it; confirm against make_pipeoper().
 */
if (this_pipe + 1 >= last_pipe) {
/* odd no. of pipes - so give this one a 'bye' to the next level */
last_pipe = this_pipe;
}
/* Next level spans [last_pipe, x->next_free_pipe). */
this_pipe = first_pipe = last_pipe;
last_pipe = x->next_free_pipe;
}
/*
 * The text buffers for pipe[0..x->next_free_pipe-1] are initialised in
 * preload_chunk_cache()
 */
/*
 * Now we've done all the mallocking we're going to do, use all that is
 * left over...
 */
/* Divvy up the remaining space as buffers for each chunk */
/* suck the data out of this_pipe ... */
/* debugf("Root pipe is %d\n", first_pipe); */
return (&x->pipe[first_pipe]);
}
static int
preload_line(CHUNK * c)
{
    unsigned char *ptr, *start_of_this_line;
    struct keyfield *key;

    /*
     * Ensure c->line describes the next unconsumed line of this chunk,
     * updating c->text / c->text_accepted; replenish the chunk from its
     * file first if no complete line is buffered.  Returns true while a
     * line is available, false once the chunk is exhausted (the caller's
     * comparison then falls through to the other branch of the merge).
     */
    if ((c->text == c->text_lim) && (c->file_chunk_length == 0)) {
        /* End of chunk: buffer empty and nothing left on disk. */
        /* debugf("sorry - all gone\n"); */
        return (0 != 0);
    }
    if (c->text == c->text_accepted) {
        /* Previous line fully consumed -- carve out the next one. */
        /* debugf("getting a line from chunk %p\n", c); */
        if ((ptr = memchr(c->text, '\n', c->text_lim - c->text)) == NULL) {
            /* No complete line buffered; pull more of the chunk in. */
            replenish_chunk(c);
            if ((ptr = memchr(c->text, '\n', c->text_lim - c->text)) == NULL) {
                sperror(__LINE__, __FILE__, "probably end of chunk?");
                display_chunk(c, "couldn't find \\n even after replenishment");
                exit(EXIT_FAILURE);
            }
        }
        /* This code pinched from findlines(): NUL-terminate in place. */
        *ptr++ = '\0';
        c->text_accepted = ptr;
        /* debugf("got line %s\n", c->text); */
        /* Oops - almost forgot to findline() on it... */
        start_of_this_line = c->line.text = c->text;
        c->line.length = c->text_accepted - c->text - 1; /* -1 for lost \n */
        /* Precompute the position of the first key for efficiency. */
        key = keyhead.next;
        if (key != NULL) {
            if (key->eword >= 0) {
                /*
                 * BUG FIX: the original passed &x->line[x->next_used_line]
                 * -- a line in the global line array, unrelated to this
                 * chunk -- so keylim was computed against the wrong line.
                 * It must be computed for the line just carved out of THIS
                 * chunk, matching the begfield(&c->line, key) call below.
                 */
                c->line.keylim = limfield(&c->line, key);
            } else {
                c->line.keylim = c->text_accepted - 1;
            }
            if (key->sword >= 0) {
                c->line.keybeg = begfield(&c->line, key);
            } else {
                if (key->skipsblanks) {
                    /* Key starts at the first non-blank character. */
                    while (blanks[UCHAR(*start_of_this_line)])
                        ++start_of_this_line;
                }
                c->line.keybeg = start_of_this_line;
            }
        }
    } else {
        /* A carved-but-unconsumed line is already sitting in c->line. */
        /* debugf("already have a line: %s\n", c->text); */
    }
    return (0 == 0);
}
int
preload_next_line(PIPE * p, CHUNK ** chunkp, int (*compare) (const void *l, const void *r))
{
    CHUNK *lhs, *rhs, *winner;
    int cmp;

    /*
     * Pull the next line up through the merge tree rooted at p, setting
     * *chunkp to the chunk whose current line sorts first (or NULL when
     * this subtree is exhausted).  Returns false only when a MERGE node
     * finds both children empty; a DATA node always returns true, with
     * *chunkp == NULL signalling emptiness instead.
     */
    if (p->type == MERGE) {
        /* Interior node: fetch a candidate line from each child. */
        preload_next_line(p->node.operand.left, &lhs, compare);
        preload_next_line(p->node.operand.right, &rhs, compare);
        /* A child is empty when it returned NULL or has no carved line. */
        if ((lhs == NULL) || (lhs->text == lhs->text_accepted)) {
            if ((rhs == NULL) || (rhs->text == rhs->text_accepted)) {
                /* Both children drained, so this subtree is drained too. */
                *chunkp = NULL;
                return (0 != 0);
            }
            winner = rhs;       /* left side done -- use up the right */
        } else if ((rhs == NULL) || (rhs->text == rhs->text_accepted)) {
            winner = lhs;       /* right side done -- use up the left */
        } else {
            /* Both sides hold a line: the comparison picks the winner. */
            cmp = compare(&lhs->line, &rhs->line);
            if (cmp < 0) {
                winner = lhs;
            } else if (cmp > 0) {
                winner = rhs;
            } else {
                /* Equal keys: prefer the left (relevant for -u). */
                winner = lhs;
            }
        }
        *chunkp = winner;
    } else if (p->type == DATA) {
        /* Leaf node: the line comes straight from this node's chunk. */
        *chunkp = &p->node.chunk.chunk;
        if (!preload_line(*chunkp)) {
            *chunkp = NULL;     /* chunk exhausted */
        }
    } else {
        sperror(__LINE__, __FILE__, "internal error");
        /*
         * NOTE(review): *chunkp is deliberately left untouched on this
         * path, as in the original; sperror presumably aborts -- confirm.
         */
    }
    return (0 == 0);
}
static void
consume_pipes(PIPE * root, FILE * outf)
{
long outsize;
long written;
CHUNK *chunk;
/*
* Having built up a tree of blocks to be merged, we remove the next line
* from the head of the tree and write it out. The PIPE structure passes
* all the requests down the appropriate branches...
*/
/* Get line through tree of pipes */
if (sort_verbose) debugf("merging:\n");
while (preload_next_line(root, &chunk, compare)) {
/* Write to output file */
/* fwrite(chunk->line.text, 1, chunk->line.length, stdout); */
/* fputc('\n', stdout); */
if (
((written = fwrite(chunk->line.text, 1, chunk->line.length, outf))
!= chunk->line.length)
||
(fputc('\n', outf) == EOF)
) {
sperror(__LINE__, __FILE__,
"Disk full on output? - tried to write %d - managed %ld\n",
chunk->line.length, written);
exit(EXIT_FAILURE);
}
/* Consume the line */
chunk->text = chunk->text_accepted;