/*
* Splits an HTML file into several files and updates
* hypertext links accordingly.
*
* A special option --djtitles[=document-name] is for
* DJ Delorie's server, where all documents have their
* HTML headers and trailers produced by Perl scripts
* called, respectively, `header' and `trailer'. The
* optional argument `document-name', if supplied, gets
* prepended to the section/chapter name when computing
* the header that is passed to the `header' script. So,
* for example, if you say --djtitles='DJGPP FAQ', the
* produced header for the section "Foo Bar" will look
* like this:
*
*
*
* Restrictions:
*
* 1. The various markups used as markers MUST be
* found verbatim, i.e. without excess whitespace
* and *not* split between 2 adjacent lines.
* 2. Currently only supports splitting the file one
* node per file; you cannot split the file by
* chapters.
* 3. The string which signals the beginning of a new
* node is hard-wired into the program and cannot
* be changed without recompiling.
*
*
* Author: Eli Zaretskii
*
* Version: 1.3
*
* Last updated: 23 October, 1999
*
* ----------------------------------------------------------
*
* You can do whatever you like with this program, except:
* (1) preventing other people (including the author) do
* whatever they like; and (2) removing the author and the
* version info above.
*
* ----------------------------------------------------------
*
*/
#include
#include
#include
#include
#include
#include
#include
#include
#ifdef __DJGPP__
#include
/* Keep our start-up code minimal: disable filename
globbing, and don't load the environment file. */
#include
/* Replacement for the start-up globbing hook: ignores ARG and always
   returns a null pointer, so no wildcard expansion is performed here. */
char **
__crt0_glob_function(char *arg)
{
  (void)arg;
  return 0;
}
/* Replacement for the start-up environment-file hook: deliberately
   does nothing with APP_NAME. */
void
__crt0_load_environment_file(char *app_name)
{
  (void)app_name;
}
#else /* not __DJGPP__ */
/* Some Unix boxes don't have function prototypes in the header files.
-Wall will complain about this, so here are the prototypes: */
void perror (const char *);
int fprintf(FILE *, const char *, ...);
/* Non-DJGPP libraries might not have these two functions. */
/* Compare at most N characters of S1 and S2, ignoring letter case.

   Returns 0 if the first N characters are equal (or both strings end
   before a difference is found), otherwise the difference between the
   lower-cased values of the first pair of characters that differ.

   Characters are converted through `unsigned char' before being handed
   to tolower(): passing a negative `char' value to a <ctype.h> function
   is undefined, and the unsigned interpretation also gives the same
   ordering as the standard string comparison functions.  */
int
strnicmp(const char *s1, const char *s2, size_t n)
{
  for (; n != 0; --n, ++s1, ++s2)
    {
      int c1 = tolower((unsigned char)*s1);
      int c2 = tolower((unsigned char)*s2);

      if (c1 != c2)
        return c1 - c2;
      if (c1 == 0)		/* both strings ended: they are equal */
        break;
    }
  return 0;
}
#include
#include
/* Return the size, in bytes, of the file open on descriptor FD as
   reported by fstat(), or -1 if fstat() fails.  */
long
filelength(int fd)
{
  struct stat info;

  return (fstat(fd, &info) == 0) ? (long)info.st_size : -1L;
}
#endif /* not __DJGPP__ */
#ifndef O_BINARY
#define O_BINARY 0
#endif
static char node_start_text[] = "Node:text, point, marker->len) == 0;
}
/* Record a position where we'll split the file, bump point. */
/* One slot of the split-position table.  Slots are zero-filled when the
   table is (re)allocated, so a NULL `section_num' or `title' means that
   nothing has been recorded for that slot yet. */
struct split_pos {
size_t pos; /* position of the split, as handed to remember_split_pos */
int fileno; /* number of the subfile this slot was recorded for */
char *section_num; /* section number string; "0_<fileno>" is used as a
fall-back for sections without a number */
char *title; /* title text recorded for the section, or NULL */
};
static struct split_pos *split_pos_table; /* table of split positions */
static int split_pos_table_size; /* the size of the table */
static int split_pos_idx; /* index of next free slot */
size_t
remember_split_pos(size_t pos, int fileno)
{
if (split_pos_idx >= split_pos_table_size)
{
if (split_pos_table)
split_pos_table =
realloc(split_pos_table,
(split_pos_table_size *= 2)*sizeof(struct split_pos));
else
{
split_pos_table_size = 100;
split_pos_table =
malloc(split_pos_table_size*sizeof(struct split_pos));
}
if (split_pos_table == (struct split_pos *)0)
{
errno = ENOMEM;
perror("split_pos table");
exit(2);
}
memset(split_pos_table + split_pos_idx, 0,
(split_pos_table_size - split_pos_idx)*sizeof(struct split_pos));
}
if (split_pos_idx > 0 && !split_pos_table[split_pos_idx].title)
{
/* There was no heading tag in this section, or the heading
didn't include a section number. Use the subfile number as a
fall-back. */
if (split_pos_table[split_pos_idx].section_num)
split_pos_table[split_pos_idx].title =
split_pos_table[split_pos_idx].section_num;
split_pos_table[split_pos_idx].section_num = malloc(80);
if (split_pos_table[split_pos_idx].section_num == NULL)
{
errno = ENOMEM;
perror("section_num slot in split_pos table");
exit(2);
}
/* We treat all unnumbered sections as if they were part of
a non-existent chapter 0. This makes sure the file names
we generate for the unnumbered sections never clash with
those for numbered sections. */
sprintf(split_pos_table[split_pos_idx].section_num, "0_%d", fileno);
}
split_pos_table[split_pos_idx].pos = pos;
split_pos_table[split_pos_idx++].fileno = fileno;
return split_marker.len;
}
/* Fill the section number and title slots of a split position. */
size_t
remember_heading(char *p, int fileno)
{
char *orig = p, *s;
size_t title_offset;
char header_level;
if (split_pos_idx >= split_pos_table_size)
{
if (split_pos_table)
split_pos_table =
realloc(split_pos_table,
(split_pos_table_size *= 2)*sizeof(struct split_pos));
else
{
split_pos_table_size = 100;
split_pos_table =
malloc(split_pos_table_size*sizeof(struct split_pos));
}
if (split_pos_table == (struct split_pos *)0)
{
errno = ENOMEM;
perror("split_pos table");
exit(2);
}
memset(split_pos_table + split_pos_idx, 0,
(split_pos_table_size - split_pos_idx)*sizeof(struct split_pos));
}
p += heading_marker.len;
if (split_pos_idx == 0
|| split_pos_table[split_pos_idx].section_num
|| !isdigit(*p) || p[1] != '>') /* could be
or some such */
return p - orig; /* just step over it */
split_pos_table[split_pos_idx].fileno = fileno;
header_level = *p;
p += 2;
for (s = p; *p && (isdigit(*p) || *p == '.'); p++)
;
title_offset = p - s;
while (*p && (*p != '<' || strnicmp(p, "= dest_pos_table_size)
{
if (dest_pos_table)
dest_pos_table =
realloc(dest_pos_table,
(dest_pos_table_size *= 2)*sizeof(struct dest_pos));
else
{
dest_pos_table_size = 100;
dest_pos_table =
malloc(dest_pos_table_size*sizeof(struct dest_pos));
}
if (dest_pos_table == (struct dest_pos *)0)
{
errno = ENOMEM;
perror("dest_pos table");
exit(2);
}
}
p += dest_marker.len;
name_start = p;
while (*p !='"')
p++;
dest_pos_table[dest_pos_idx].fileno = fileno;
dest_pos_table[dest_pos_idx].name = malloc(p - name_start + 1);
if (dest_pos_table[dest_pos_idx].name == (char *)0)
{
errno = ENOMEM;
perror("name in dest_pos table");
exit(2);
}
strncpy(dest_pos_table[dest_pos_idx].name, name_start, p - name_start);
dest_pos_table[dest_pos_idx++].name[p - name_start] = '\0';
return p - save_point;
}
/* Skip ``?[\\^`|~=") == NULL)
{
/* ARG has no metacharacters. Just copy ARG to QUOTED. */
len = strlen(arg);
if (quoted)
strcpy(quoted, arg);
}
else
{
char c;
if (quoted)
quoted[len] = '\'';
len++;
for (c = *arg++; c; c = *arg++)
{
if (c == '\'')
{
if (quoted)
{
quoted[len] = '\'';
quoted[len + 1] = '\\';
quoted[len + 2] = '\'';
}
len += 3;
}
/* Replace any newlines with spaces, since we only use this
function for printing a title via the `header' script. */
if (c == '\r' || c == '\n')
{
/* If we have a DOS-style CR-LF pair, replace
them both with a single space. */
if (len > 1 && c == '\r' && *arg == '\n')
arg++;
c = ' ';
}
if (quoted)
quoted[len] = c;
len++;
}
if (quoted)
{
quoted[len] = '\'';
quoted[len + 1] = '\0';
}
len++;
}
return len;
}
/* Strip STRING, in place, of the HTML style tags that can appear in a
   section or a chapter title.

   Each recognized opening or closing tag ("<TAG>" or "</TAG>", matched
   case-insensitively) is either replaced by a single character or
   removed altogether, as given by the table below.  Unrecognized tags
   are left alone.  Outside of recognized tags, a double quote is turned
   into a single quote when quote_djtitles is zero.

   The result is never longer than the input, so the rewrite is done
   inside the caller's buffer. */
static void
remove_markup_tags (char *string)
{
  static const struct markup {
    const char *tag;	/* the tag name, without the angle brackets */
    size_t tag_len;	/* its length */
    int replace;	/* with what to replace; -1 means just remove it */
  } style_markup[] = {
    {"em", 2, '*'},
    {"strong", 6, '*'},
    {"code", 4, '\''},
    {"samp", 4, '\''},
    {"kbd", 3, -1},
    {"var", 3, '\''},
    {"dfn", 3, '"'},
    {"cite", 4, '"'},
    {"b", 1, '*'},
    {"i", 1, '*'},
    {"u", 1, '*'},
    {"tt", 2, -1}
  };
  const size_t n_markups = sizeof(style_markup)/sizeof(style_markup[0]);
  char *s = string;	/* read cursor */
  char *d = string;	/* write cursor, never ahead of S */

  while (*s != '\0')
    {
      if (*s == '<' && s[1] != '\0')
        {
          const struct markup *hit = NULL;
          /* The tag name starts after "<" or after "</". */
          char *name = s + (s[1] == '/' ? 2 : 1);
          size_t i;

          for (i = 0; i < n_markups; i++)
            {
              const struct markup *m = &style_markup[i];

              if (strnicmp(name, m->tag, m->tag_len) == 0
                  && name[m->tag_len] == '>')
                {
                  hit = m;
                  break;
                }
            }

          if (hit == NULL)
            *d++ = *s;			/* not one of ours: keep the '<' */
          else
            {
              /* Emit the replacement, if any.  The write cursor is only
                 ever advanced, so it cannot step before STRING even when
                 a removed tag is the very first thing in the title. */
              if (hit->replace != -1)
                *d++ = (char)hit->replace;
              s = name + hit->tag_len;	/* now at the closing '>' */
            }
        }
      /* --djtitles doesn't allow quotes. */
      else if (!quote_djtitles && *s == '"')
        *d++ = '\'';
      else
        *d++ = *s;

      s++;
    }
  *d = '\0';		/* null-terminate */
}
static int djtitles = 0; /* non-zero if they asked for DJ-style headers */
static char *djtitle_string = ""; /* string at the beginning of banners */
/* EOL style of the source file (Unix or DOS).  This is a running vote:
it is bumped up for each split marker preceded by a CR and down for
each one that is not, so a positive value means DOS-style CR-LF. */
static int eol_style = 0;
/* The EOL strings, indexed by (eol_style > 0): Unix first, then DOS. */
static const char *eol[] = { "\n", "\r\n" };
/* Write a standard header for an HTML subfile. */
void
write_subfile_header(int fd, int subfile, register char *pos, char *end)
{
static const char html_prologue[] = "";
static const char begin_title[] = "";
static const char end_title[] = "";
static const char dj_header_fmt[]
= "%s";
static const char no_title[] = "'(Untitled)'";
static const char *eol_string;
static size_t eol_len;
char *header;
char *title;
int title_len = sizeof(no_title) - 1;
char *title_str;
/* Use the same EOL style they used in the source file. */
if (eol_string == (char *)0)
{
eol_string = eol[eol_style > 0];
eol_len = strlen(eol_string);
}
title = subfile_title (subfile);
if (title)
title_len = strlen(title);
else
{
fprintf(stderr, "no title found for subfile %d\n", subfile);
title = (char *)no_title;
}
/* Prepend the document name to every title. */
title_str = malloc(title_len + 4 + strlen(djtitle_string) + 1);
if (title_str == (char *)0)
{
errno = ENOMEM;
perror("HTML title");
exit(2);
}
title_str[0] = '\0';
if (djtitle_string && *djtitle_string)
strcat(strcat(title_str, djtitle_string), " -- ");
strncat(title_str, title, title_len);
title_len = strlen(title_str);
title = title_str;
/* Need to strip off all the HTML markup tags in the title, or else they
will show in the header verbatim (since headers are not rendered). */
remove_markup_tags (title);
if (djtitles && quote_djtitles)
{
char *quoted_title;
size_t quoted_title_len;
/* Quote the title, so that characters special to the shell
don't confuse the Web server when it invokes the `header'
script. */
quoted_title_len = quote_title(title_str, NULL); /* how much space? */
quoted_title = malloc(quoted_title_len + 1);
if (quoted_title == (char *)0)
{
errno = ENOMEM;
perror("Quoting HTML title");
exit(2);
}
quote_title(title_str, quoted_title); /* actually quote it */
free(title);
title = quoted_title;
title_len = quoted_title_len;
}
/* Create the header. */
header = malloc((djtitles
/* the magic 6 below is the combined length
of %s and other format specifiers in
dj_header_fmt, which get removed by sprintf. */
? sizeof(dj_header_fmt) - 6 - 1
: sizeof(begin_title) - 1 + sizeof(end_title) - 1)
+ title_len + eol_len);
if (header == (char *)0)
{
errno = ENOMEM;
perror("HTML header");
exit(2);
}
if (djtitles)
sprintf(header, dj_header_fmt, title_len, title, eol_string);
else
sprintf(header, "%s%.*s%s%s",
begin_title, title_len, title, end_title, eol_string);
/* Write it. */
if ((!djtitles
&& (write(fd, html_prologue, sizeof(html_prologue) - 1) <= 0
|| write(fd, eol_string, eol_len) <= 0))
|| write(fd, header, strlen(header)) <= 0)
{
perror("write subfile header");
exit(2);
}
free(header);
if (title != no_title)
free(title);
}
/* Write a standard trailer for an HTML subfile to descriptor FD.

   The trailer text is built once, on the first call, and reused for
   all later calls: it is dj_trailer plus an EOL when djtitles is set,
   otherwise end_body and end_html, each followed by an EOL.  SUBFILE
   and POS are accepted for the callers' benefit but are not used.

   Exits with status 2 if memory cannot be obtained, or if the write
   fails or is short.

   NOTE(review): the markup literals below are empty in this copy of
   the file; their HTML content was presumably lost -- verify against
   a pristine copy. */
void
write_subfile_trailer(int fd, int subfile, char *pos)
{
  static const char end_body[] = "";
  static const char end_html[] = "";
  static const char dj_trailer[] = "";
  static char *trailer;
  static size_t trailer_len;
  long written;

  /* Generate the trailer if it wasn't done already (all
     trailers are identical).  */
  if (trailer == (char *)0)
    {
      /* Use the same EOL style they used in the source file.  */
      const char *eol_string = eol[eol_style > 0];

      trailer = malloc((djtitles ? sizeof(dj_trailer) - 1
                        : sizeof(end_body) - 1 + sizeof(end_html) - 1
                          + strlen(eol_string))
                       + strlen(eol_string) + 1);
      if (trailer == (char *)0)
        {
          errno = ENOMEM;
          perror("HTML trailer");
          exit(2);
        }
      if (djtitles)
        strcat(strcpy(trailer, dj_trailer), eol_string);
      else
        {
          strcat(strcpy(trailer, end_body), eol_string);
          strcat(strcat(trailer, end_html), eol_string);
        }
      trailer_len = strlen(trailer);
    }

  /* Write the trailer.  The result is kept in a signed variable and
     tested for an error first: comparing write's -1 directly against
     the unsigned TRAILER_LEN would convert it to a huge value and let
     the failure go unnoticed.  */
  written = write(fd, trailer, trailer_len);
  if (written < 0 || (size_t)written < trailer_len)
    {
      perror("write subfile trailer");
      exit(2);
    }
}
int
main(int argc, char *argv[])
{
/* Did they ask for a special header and trailer? */
if (argc == 4)
{
if (strncmp(argv[1], "--djtitles", 10) == 0)
{
djtitles = 1;
if (argv[1][10] == '=')
{
djtitle_string = malloc(strlen(argv[1] + 10));
if (djtitle_string)
strcpy(djtitle_string, argv[1] + 11);
}
argv[1] = argv[0];
++argv;
--argc;
}
}
if (!djtitle_string)
djtitle_string = "";
if (argc == 3)
{
int in_fd = open(argv[1], O_RDONLY | O_BINARY);
int out_fd;
long fsize, actual_size;
char *in_file;
char *p, *last_p, *from;
int subfile = 0;
char subfile_name[FILENAME_MAX];
size_t split_pos;
/* How many character positions do we need to back up from the
end of the split_marker in order to get to " in_file; --p)
{
fsize--;
actual_size--;
}
if (fsize < 2048)
{
fprintf(stderr, "%s: too small to bother\n", argv[1]);
return 3;
}
p[1] = '\0';
/* Pass 1: Determine the file positions where the file
will be split, and remember positions of the
destination anchors. */
for (last_p = p, p = in_file; p < last_p; )
{
if (*p == '\n'
&& *++p == split_marker.text[0] && looking_at(&split_marker, p))
{
/* Guess what EOL style they use. */
if (p > in_file + 1 && p[-2] == '\r' && eol_style < INT_MAX)
eol_style++;
else if (eol_style > INT_MIN)
eol_style--;
p += remember_split_pos(p - in_file, subfile);
/* The text of split_marker could include dest_marker as
its substring, in which case we should remember the
destination anchor position as well. */
if (split_marker_backup_to_anchor)
p -= split_marker_backup_to_anchor;
subfile++;
}
else if (*p == dest_marker.text[0] && looking_at(&dest_marker, p))
{
p += remember_dest_pos(p, subfile);
}
else if (*p == heading_marker.text[0]
&& looking_at(&heading_marker, p))
{
p += remember_heading(p, subfile);
}
else
++p;
}
/* Last subfile ends at EOF. */
remember_split_pos(p - in_file, subfile);
subfile++;
/* Pass 2: Generate the subfiles with updated links. */
subfile = 0;
sprintf(subfile_name, "%s.html", argv[2]);
if ((out_fd = open(subfile_name,
O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666)) == -1)
{
perror(subfile_name);
return 2;
}
split_pos = get_split_pos(subfile);
for (p = in_file, from = p; p < last_p; ++p)
{
const marker_t *marker = NULL;
if (p - in_file >= split_pos) /* time to start another file */
{
if (write(out_fd, from, split_pos - (from - in_file)) <= 0)
{
perror("write at split position");
return 2;
}
from = in_file + split_pos;
/* End up the subfile with a trailer. */
write_subfile_trailer(out_fd, subfile, from);
close(out_fd);
split_pos = get_split_pos(++subfile);
sprintf(subfile_name, "%s%s.html",
argv[2], subfile_section_num(subfile));
if ((out_fd = open(subfile_name,
O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
0666)) == -1)
{
perror(subfile_name);
return 2;
}
/* Write the HTML title of this subfile. */
write_subfile_header(out_fd, subfile, from, in_file + split_pos);
}
else if (*p == '<'
&& (looking_at(marker = &link_marker, p)
|| looking_at(marker = &rel_marker, p)))
{
int which_file;
p = skip_until_anchor_name(p, marker);
which_file = subfile_num_for_anchor_at_point(p);
--p; /* the `#' character goes AFTER the file */
sprintf(subfile_name, which_file ? "%s%s.html" : "%s.html",
argv[2], subfile_section_num(which_file));
if (write(out_fd, from, p - from) <= 0 ||
write(out_fd, subfile_name, strlen(subfile_name)) <= 0)
{
perror("write at anchor name");
return 2;
}
from = p;
}
}
if (p != from)
if (write(out_fd, from, p - from) <= 0)
{
perror("write at EOF");
return 2;
}
fprintf(stderr, "%s was split into %d file%s\n",
argv[1], subfile + 1, subfile ? "s" : "");
return 0;
}
else
{
fprintf(stderr,
"Usage: %s [--djtitles[=banner]] inputfile outbase\n", *argv);
return 1;
}
}