xref: /aosp_15_r20/external/libdivsufsort/examples/bwt.c (revision 30b9430b2d8672faf9045aa522d63599a84e8e49)
/*
 * bwt.c for libdivsufsort
 * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
26*30b9430bSXin Li 
27*30b9430bSXin Li #if HAVE_CONFIG_H
28*30b9430bSXin Li # include "config.h"
29*30b9430bSXin Li #endif
30*30b9430bSXin Li #include <stdio.h>
31*30b9430bSXin Li #if HAVE_STRING_H
32*30b9430bSXin Li # include <string.h>
33*30b9430bSXin Li #endif
34*30b9430bSXin Li #if HAVE_STDLIB_H
35*30b9430bSXin Li # include <stdlib.h>
36*30b9430bSXin Li #endif
37*30b9430bSXin Li #if HAVE_MEMORY_H
38*30b9430bSXin Li # include <memory.h>
39*30b9430bSXin Li #endif
40*30b9430bSXin Li #if HAVE_STDDEF_H
41*30b9430bSXin Li # include <stddef.h>
42*30b9430bSXin Li #endif
43*30b9430bSXin Li #if HAVE_STRINGS_H
44*30b9430bSXin Li # include <strings.h>
45*30b9430bSXin Li #endif
46*30b9430bSXin Li #if HAVE_SYS_TYPES_H
47*30b9430bSXin Li # include <sys/types.h>
48*30b9430bSXin Li #endif
49*30b9430bSXin Li #if HAVE_IO_H && HAVE_FCNTL_H
50*30b9430bSXin Li # include <io.h>
51*30b9430bSXin Li # include <fcntl.h>
52*30b9430bSXin Li #endif
53*30b9430bSXin Li #include <time.h>
54*30b9430bSXin Li #include <divsufsort.h>
55*30b9430bSXin Li #include "lfs.h"
56*30b9430bSXin Li 
57*30b9430bSXin Li 
58*30b9430bSXin Li static
59*30b9430bSXin Li size_t
write_int(FILE * fp,saidx_t n)60*30b9430bSXin Li write_int(FILE *fp, saidx_t n) {
61*30b9430bSXin Li   unsigned char c[4];
62*30b9430bSXin Li   c[0] = (unsigned char)((n >>  0) & 0xff), c[1] = (unsigned char)((n >>  8) & 0xff),
63*30b9430bSXin Li   c[2] = (unsigned char)((n >> 16) & 0xff), c[3] = (unsigned char)((n >> 24) & 0xff);
64*30b9430bSXin Li   return fwrite(c, sizeof(unsigned char), 4, fp);
65*30b9430bSXin Li }
66*30b9430bSXin Li 
/*
 * Print the version banner and the usage text to stderr, then terminate
 * the process with the given exit status. This function does not return.
 *
 * progname: program name shown in the usage line (callers pass argv[0]).
 * status:   value handed to exit().
 */
static
void
print_help(const char *progname, int status) {
  const char *version = divsufsort_version();

  fprintf(stderr,
          "bwt, a burrows-wheeler transform program, version %s.\n",
          version);
  fprintf(stderr, "usage: %s [-b num] INFILE OUTFILE\n", progname);
  /* No format arguments on this line, so emit it verbatim. */
  fputs("  -b num    set block size to num MiB [1..512] (default: 32)\n\n", stderr);
  exit(status);
}
77*30b9430bSXin Li 
78*30b9430bSXin Li int
main(int argc,const char * argv[])79*30b9430bSXin Li main(int argc, const char *argv[]) {
80*30b9430bSXin Li   FILE *fp, *ofp;
81*30b9430bSXin Li   const char *fname, *ofname;
82*30b9430bSXin Li   sauchar_t *T;
83*30b9430bSXin Li   saidx_t *SA;
84*30b9430bSXin Li   LFS_OFF_T n;
85*30b9430bSXin Li   size_t m;
86*30b9430bSXin Li   saidx_t pidx;
87*30b9430bSXin Li   clock_t start,finish;
88*30b9430bSXin Li   saint_t i, blocksize = 32, needclose = 3;
89*30b9430bSXin Li 
90*30b9430bSXin Li   /* Check arguments. */
91*30b9430bSXin Li   if((argc == 1) ||
92*30b9430bSXin Li      (strcmp(argv[1], "-h") == 0) ||
93*30b9430bSXin Li      (strcmp(argv[1], "--help") == 0)) { print_help(argv[0], EXIT_SUCCESS); }
94*30b9430bSXin Li   if((argc != 3) && (argc != 5)) { print_help(argv[0], EXIT_FAILURE); }
95*30b9430bSXin Li   i = 1;
96*30b9430bSXin Li   if(argc == 5) {
97*30b9430bSXin Li     if(strcmp(argv[i], "-b") != 0) { print_help(argv[0], EXIT_FAILURE); }
98*30b9430bSXin Li     blocksize = atoi(argv[i + 1]);
99*30b9430bSXin Li     if(blocksize < 0) { blocksize = 1; }
100*30b9430bSXin Li     else if(512 < blocksize) { blocksize = 512; }
101*30b9430bSXin Li     i += 2;
102*30b9430bSXin Li   }
103*30b9430bSXin Li   blocksize <<= 20;
104*30b9430bSXin Li 
105*30b9430bSXin Li   /* Open a file for reading. */
106*30b9430bSXin Li   if(strcmp(argv[i], "-") != 0) {
107*30b9430bSXin Li #if HAVE_FOPEN_S
108*30b9430bSXin Li     if(fopen_s(&fp, fname = argv[i], "rb") != 0) {
109*30b9430bSXin Li #else
110*30b9430bSXin Li     if((fp = LFS_FOPEN(fname = argv[i], "rb")) == NULL) {
111*30b9430bSXin Li #endif
112*30b9430bSXin Li       fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], fname);
113*30b9430bSXin Li       perror(NULL);
114*30b9430bSXin Li       exit(EXIT_FAILURE);
115*30b9430bSXin Li     }
116*30b9430bSXin Li   } else {
117*30b9430bSXin Li #if HAVE__SETMODE && HAVE__FILENO
118*30b9430bSXin Li     if(_setmode(_fileno(stdin), _O_BINARY) == -1) {
119*30b9430bSXin Li       fprintf(stderr, "%s: Cannot set mode: ", argv[0]);
120*30b9430bSXin Li       perror(NULL);
121*30b9430bSXin Li       exit(EXIT_FAILURE);
122*30b9430bSXin Li     }
123*30b9430bSXin Li #endif
124*30b9430bSXin Li     fp = stdin;
125*30b9430bSXin Li     fname = "stdin";
126*30b9430bSXin Li     needclose ^= 1;
127*30b9430bSXin Li   }
128*30b9430bSXin Li   i += 1;
129*30b9430bSXin Li 
130*30b9430bSXin Li   /* Open a file for writing. */
131*30b9430bSXin Li   if(strcmp(argv[i], "-") != 0) {
132*30b9430bSXin Li #if HAVE_FOPEN_S
133*30b9430bSXin Li     if(fopen_s(&ofp, ofname = argv[i], "wb") != 0) {
134*30b9430bSXin Li #else
135*30b9430bSXin Li     if((ofp = LFS_FOPEN(ofname = argv[i], "wb")) == NULL) {
136*30b9430bSXin Li #endif
137*30b9430bSXin Li       fprintf(stderr, "%s: Cannot open file `%s': ", argv[0], ofname);
138*30b9430bSXin Li       perror(NULL);
139*30b9430bSXin Li       exit(EXIT_FAILURE);
140*30b9430bSXin Li     }
141*30b9430bSXin Li   } else {
142*30b9430bSXin Li #if HAVE__SETMODE && HAVE__FILENO
143*30b9430bSXin Li     if(_setmode(_fileno(stdout), _O_BINARY) == -1) {
144*30b9430bSXin Li       fprintf(stderr, "%s: Cannot set mode: ", argv[0]);
145*30b9430bSXin Li       perror(NULL);
146*30b9430bSXin Li       exit(EXIT_FAILURE);
147*30b9430bSXin Li     }
148*30b9430bSXin Li #endif
149*30b9430bSXin Li     ofp = stdout;
150*30b9430bSXin Li     ofname = "stdout";
151*30b9430bSXin Li     needclose ^= 2;
152*30b9430bSXin Li   }
153*30b9430bSXin Li 
154*30b9430bSXin Li   /* Get the file size. */
155*30b9430bSXin Li   if(LFS_FSEEK(fp, 0, SEEK_END) == 0) {
156*30b9430bSXin Li     n = LFS_FTELL(fp);
157*30b9430bSXin Li     rewind(fp);
158*30b9430bSXin Li     if(n < 0) {
159*30b9430bSXin Li       fprintf(stderr, "%s: Cannot ftell `%s': ", argv[0], fname);
160*30b9430bSXin Li       perror(NULL);
161*30b9430bSXin Li       exit(EXIT_FAILURE);
162*30b9430bSXin Li     }
163*30b9430bSXin Li     if(0x20000000L < n) { n = 0x20000000L; }
164*30b9430bSXin Li     if((blocksize == 0) || (n < blocksize)) { blocksize = (saidx_t)n; }
165*30b9430bSXin Li   } else if(blocksize == 0) { blocksize = 32 << 20; }
166*30b9430bSXin Li 
167*30b9430bSXin Li   /* Allocate 5blocksize bytes of memory. */
168*30b9430bSXin Li   T = (sauchar_t *)malloc(blocksize * sizeof(sauchar_t));
169*30b9430bSXin Li   SA = (saidx_t *)malloc(blocksize * sizeof(saidx_t));
170*30b9430bSXin Li   if((T == NULL) || (SA == NULL)) {
171*30b9430bSXin Li     fprintf(stderr, "%s: Cannot allocate memory.\n", argv[0]);
172*30b9430bSXin Li     exit(EXIT_FAILURE);
173*30b9430bSXin Li   }
174*30b9430bSXin Li 
175*30b9430bSXin Li   /* Write the blocksize. */
176*30b9430bSXin Li   if(write_int(ofp, blocksize) != 4) {
177*30b9430bSXin Li     fprintf(stderr, "%s: Cannot write to `%s': ", argv[0], ofname);
178*30b9430bSXin Li     perror(NULL);
179*30b9430bSXin Li     exit(EXIT_FAILURE);
180*30b9430bSXin Li   }
181*30b9430bSXin Li 
182*30b9430bSXin Li   fprintf(stderr, "  BWT (blocksize %" PRIdSAINT_T ") ... ", blocksize);
183*30b9430bSXin Li   start = clock();
184*30b9430bSXin Li   for(n = 0; 0 < (m = fread(T, sizeof(sauchar_t), blocksize, fp)); n += m) {
185*30b9430bSXin Li     /* Burrows-Wheeler Transform. */
186*30b9430bSXin Li     pidx = divbwt(T, T, SA, m);
187*30b9430bSXin Li     if(pidx < 0) {
188*30b9430bSXin Li       fprintf(stderr, "%s (bw_transform): %s.\n",
189*30b9430bSXin Li         argv[0],
190*30b9430bSXin Li         (pidx == -1) ? "Invalid arguments" : "Cannot allocate memory");
191*30b9430bSXin Li       exit(EXIT_FAILURE);
192*30b9430bSXin Li     }
193*30b9430bSXin Li 
194*30b9430bSXin Li     /* Write the bwted data. */
195*30b9430bSXin Li     if((write_int(ofp, pidx) != 4) ||
196*30b9430bSXin Li        (fwrite(T, sizeof(sauchar_t), m, ofp) != m)) {
197*30b9430bSXin Li       fprintf(stderr, "%s: Cannot write to `%s': ", argv[0], ofname);
198*30b9430bSXin Li       perror(NULL);
199*30b9430bSXin Li       exit(EXIT_FAILURE);
200*30b9430bSXin Li     }
201*30b9430bSXin Li   }
202*30b9430bSXin Li   if(ferror(fp)) {
203*30b9430bSXin Li     fprintf(stderr, "%s: Cannot read from `%s': ", argv[0], fname);
204*30b9430bSXin Li     perror(NULL);
205*30b9430bSXin Li     exit(EXIT_FAILURE);
206*30b9430bSXin Li   }
207*30b9430bSXin Li   finish = clock();
208*30b9430bSXin Li   fprintf(stderr, "%" PRIdOFF_T " bytes: %.4f sec\n",
209*30b9430bSXin Li     n, (double)(finish - start) / (double)CLOCKS_PER_SEC);
210*30b9430bSXin Li 
211*30b9430bSXin Li   /* Close files */
212*30b9430bSXin Li   if(needclose & 1) { fclose(fp); }
213*30b9430bSXin Li   if(needclose & 2) { fclose(ofp); }
214*30b9430bSXin Li 
215*30b9430bSXin Li   /* Deallocate memory. */
216*30b9430bSXin Li   free(SA);
217*30b9430bSXin Li   free(T);
218*30b9430bSXin Li 
219*30b9430bSXin Li   return 0;
220*30b9430bSXin Li }
221