/* .................................................... documentation ...
 *
 * Bug fixes by: Jari Aalto <jaalto@tre.tele.nokia.fi>
 * Bug fixes to the bug fixes: Neal Becker <neal@ctd.comsat.com>
 *
 * LATIN1 is #defined below; remove that #define if you want the
 * cyrillic code page translations.  I think that's the only
 * user-serviceable part.  The author's original message follows:
 *
 * --------------cut-here-----------------------------------------------
 * From vitus@agropc.msk.su Fri Dec 27 20:47:09 1996
 * From: "Victor B. Wagner" <vitus@agropc.msk.su>
 * Newsgroups: comp.os.linux.misc,comp.os.linux.development.apps,
 *             comp.unix.programmer
 * Subject: CATDOC ver 0.3 (MS-Word reader for UNIX and DOS)
 *          released (source here)
 * Date: 27 Dec 1996 16:47:30 +0300
 * Organization: unknown
 * Lines: 251
 * Sender: news-server@news.demos.su
 * Distribution: su
 * Message-ID: <199612271347.QAA04371@softweyr.agropc.msk.su>
 * Reply-To: vitus@agropc.msk.su
 * NNTP-Posting-Host: news.demos.su
 * X-Return-Path: news.demos.su!kremvax.demos.su!agropc!agropc.msk.su!vitus
 * Xref: localhost comp.os.linux.misc:20725 comp.os.linux.development.apps:5415
 *
 * Hi, All!
 *
 * I've just finished third release of MS-Word to ASCII converter.
 * New features:
 *
 * 1. Some Word character can be converted into TeX control sequences
 * 2. Optionally, program can return exit code 1, if file is not in MS-Word
 *    format (application of this feature see below)
 * 3. Source code is completely rewritten to achieve more clarity.
 *
 * Missing features:
 *
 * 1. fast saves still not handled correctly.
 * 2. footnotes are viewed as separate paragraphs at the end of text without
 *   any link to their marks (footnote marks are silently deleted)
 * 3. Nothing done with embedded illustrations and OLE objects.
 * 4. Reserved sign is translated just to (R) instead of correct TeX sequence
 * 5. Some garbage is displayed at the end (and possible at the start)
 *   of document
 *
 * Usage notes
 *
 * -t switch causes replacing of special symbols such as em-dash by
 *    TeX (LaTeX) commands instead of ASCII printable equivalents
 * -a disables effect of previously specified -t
 *
 *
 * -s switch: if program cannot find MS-Word signature before
 * first printable paragraph, it exits with code 1, supposing that it is
 * just plain text which has .doc suffix only by coincidence.
 *
 * In case of errors it returns exit code 2.
 *
 * So I write following command file to view doc files in DOS environment:
 *
 * ------------------
 * @echo off
 * rem docview.bat - viewer for files with DOC extension
 * rem uses Norton's WPVIEW.EXE
 * catdoc -s %1>%TEMP%\docview.tmp
 * if errorlevel 2 goto quit
 * if errorlevel 1 copy %1 %TEMP%\docview.tmp
 * rem this is not winword file. Probably it is viewable without special effort
 * wpview %TEMP%\docview.tmp
 * :quit
 * del %TEMP%\docview.tmp
 * ---------------
 *
 * If you want to use this program as filter (I wonder, who would supply
 * Word file for stdin), just use dash '-' instead of file name
 *
 * Program text follows:
 *
 *
 */

/* .................................................... program start ... */

/* catdoc.c version 0.3
 *
 * $Id$
 * $Version$
 *
 *
 */

/* .......................................................... include ... */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* ........................................................... define ... */

/* Output line length limit: format() wraps a line once it grows past this. */
#define TEXT_WIDTH 72
/* When defined, recode_char() passes bytes through unchanged and the
 * cyrillic translation tables are not compiled in. */
#define LATIN1


/* ......................................................... charsets ... */

/* The LATIN1 define (see the defines above) must be enabled if you don't
 * want cyrillic code page translations */

/*
 * Input bytes that need special treatment, in lookup order.
 *
 * map_char() searches this array for an input byte and uses the position of
 * the match as the index into a replacement table (ascii_specs or
 * TeX_specs), so those tables must list their entries in exactly this order.
 *
 * The array is NUL-terminated so that it is a valid C string and can be
 * searched with strchr() without reading past its end.
 */
char specs[] =
{ 7, /* table column separator - handled specially */
      '\n',/* hook to handle end of line in tables */
      0x1E,/* non-breaking hyphen */
      0x1F,/* soft hyphen */
      0x85,/* ellipsis (dots) */
      0x91,/* opening single quote */
      0x92,/* closing single quote */
      0x93,/* opening double quote */
      0x94,/* closing double quote */
      0x96,/* en dash */
      0x97,/* em dash */
      0x99,/* Trade Mark sign */
      0xA0,/* non-breaking space */
      0xA9,/* Copyright sign */
      0xAE,/* Registered sign */
      0xAB,/* opening << quote */
      0xBB,/* closing >> quote */
      /* The rest is translated into itself unless TeX mode is selected */
      '%','$','_','{','}','\\',
      '\0' /* terminator - not a lookup entry */
};

/*
 * Plain-ASCII replacement for each special byte.  Entry N is the text
 * printed for the N-th byte listed in specs[].
 */
char *ascii_specs[] =
{
    "\t",	/* 0x07 table column separator */
    "\n",	/* end of table row */
    "-",	/* 0x1E */
    "",		/* 0x1F soft hyphen: dropped */
    "...",	/* 0x85 */
    "`",	/* 0x91 */
    "'",	/* 0x92 */
    "``",	/* 0x93 */
    "''",	/* 0x94 */
    "-",	/* 0x96 */
    "-",	/* 0x97 */
    "tm",	/* 0x99 */
    " ",	/* 0xA0 */
    "(c)",	/* 0xA9 */
    "(R)",	/* 0xAE */
    "\"",	/* 0xAB */
    "\"",	/* 0xBB */
    /* characters that are only special in TeX mode map to themselves */
    "%",
    "$",
    "_",
    "{",
    "}",
    "\\"
};


/*
 * TeX/LaTeX replacement for each special byte.  Entry N is the text printed
 * for the N-th byte listed in specs[], so this table needs one entry per
 * byte in specs[] (23), in the same order.
 */
char *TeX_specs[]=
{
    "\t&","\\\\\n","-","\\-","\\dots{}","`","'","``","''",
    "--",	/* 0x96 is the en dash in the Windows code pages */
    "---",	/* 0x97 is the em dash */
    "${}^{\\scriptscriptstyle\\mathrm{TM}}$",/* this is my idea about tm sign*/
    "~",
    "{\\copyright}",
    "(R)",/* to be replaced with correct command */
    "<",">",
    /* '%', '$', '_', '{', '}', '\\' - all six need escaping in TeX */
    "\\%","\\$","\\_","$\\{$","$\\}$","$\\backslash$",
};


/*
 * Byte recoding.
 *
 * Without LATIN1, recode_char(x) looks x up in a 256-entry table; which
 * table is compiled depends on whether the target looks like unix.  With
 * LATIN1, recode_char(x) yields x unchanged and no table is compiled.
 *
 * The lookup index is converted to unsigned char so that a byte >= 0x80
 * arriving sign-extended from a plain char cannot index before the table.
 */
#ifndef LATIN1
/* `unix` is not predefined in strict ISO modes (-std=c99 etc.), so test the
 * reserved-namespace spellings as well. */
#if defined(unix) || defined(__unix__) || defined(__unix)


unsigned char table[256]=
{
/* Windows cyrillic code page to KOI-8 */
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0D,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x2D,0x20,
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
0x80,0x81,0x82,0xAA,0x8F,0x90,0xA9,0x93,0x84,0x92,0x91,0x94,0x83,0x95,0x99,0x8B,
0x98,0x60,0x27,0x22,0x22,0x9A,0x2D,0x2D,0x9E,0xA6,0x87,0xB0,0x8D,0x97,0x86,0xA2,
0x20,0xA7,0xA5,0x88,0xA4,0x8E,0x96,0x85,0xB3,0xA1,0x9F,0x22,0xAB,0xAC,0xAD,0xAE,
0xAF,0xB2,0xB1,'i',0xB5,0xB6,0xB7,0xB8,0xA3,0xB9,0xBA,0x22,0xBC,0xBD,0xBE,0x9B,
0xE1,0xE2,0xF7,0xE7,0xE4,0xE5,0xF6,0xFA,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xF0,
0xF2,0xF3,0xF4,0xF5,0xE6,0xE8,0xE3,0xFE,0xFB,0xFD,0xFF,0xF9,0xF8,0xFC,0xE0,0xF1,
0xC1,0xC2,0xD7,0xC7,0xC4,0xC5,0xD6,0xDA,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xD0,
0xD2,0xD3,0xD4,0xD5,0xC6,0xC8,0xC3,0xDE,0xDB,0xDD,0xDF,0xD9,0xD8,0xDC,0xC0,0xD1};


#else

/* Table used on non-unix targets; presumably Windows cyrillic to the DOS
 * cyrillic code page - TODO confirm.
 * NOTE(review): this table has 'i' at index 0xB4, while the KOI-8 table
 * above has it at index 0xB3 - confirm which position is intended. */
unsigned char table[256]=
{
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0D,0x0c,0x0d,0x0e,0x0f,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x2D,0x20,
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x60,0x27,0x22,0x22,0x95,0x2D,0x2D,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0x20,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0x22,0xac,0xad,0xae,0xaf,
0xb0,0xb1,0xb2,0xb3,'i',0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0x22,0xbc,0xbd,0xbe,0xbf,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef};

#endif

#define recode_char(x) (table[(unsigned char)(x)])

#else
#define recode_char(x) (x)
#endif


/* ............................................................. func ... */
/*
 * Translate one input byte into the string to print for it.
 *
 * map - replacement table indexed by position in specs[]
 *       (ascii_specs or TeX_specs)
 * c   - input byte; may arrive sign-extended from a plain char
 *
 * Returns the map entry when c is one of the bytes listed in specs[].
 * Otherwise returns a one-character string holding recode_char(c); that
 * string lives in a static buffer and is overwritten by the next call.
 */
char *map_char(char **map,int c)
{
    static char		buffer[2]="a";
    unsigned char	byte = (unsigned char)c;
    const char		*hit = NULL;

    /* Bounded search: never reads past specs[].  A zero byte is excluded
     * so that it cannot match a terminator and index past the map. */
    if (byte != 0)
    {
	hit = memchr( specs, byte, sizeof specs);
    }

    if (hit)
    {
	return map[ hit - specs ];
    }

    buffer[0]=recode_char(byte);
    return buffer;
}


/* ............................................................. func ... */
/*
 * Print one paragraph, translating special bytes through map_char() and
 * wrapping the output so that no buffered line exceeds TEXT_WIDTH.
 *
 * buf - NUL-terminated paragraph text
 * map - replacement table handed to map_char()
 *
 * A byte 7 is a table column separator; a second 7 directly after one ends
 * the table row and emits the map's '\n' entry.  Lines are broken at the
 * last space, or hard after TEXT_WIDTH characters when there is no space.
 * The paragraph ends with an empty line.
 *
 * Assumes TEXT_WIDTH is smaller than the size of the line buffer.
 */
void format( char *buf, char **map)
{
    char    outstring[128];
    size_t  len = 0;		/* strlen(outstring), kept up to date */
    char    *sp = buf;
    int	    after_sep = 0;	/* previous byte was a column separator */

    outstring[0] = '\0';			/* clear as "" */

    while (*sp)
    {
	if (*sp==7 && after_sep)
	{
	    printf("%s%s",outstring,map_char(map,'\n'));
	    outstring[0] = '\0';
	    len = 0;
	    after_sep = 0;
	    sp++;
	}
	else
	{
	    const char	*piece = map_char( map, *sp);
	    size_t	piece_len = strlen( piece);

	    if (piece_len >= sizeof outstring - len)
	    {
		/* Replacement does not fit in the line buffer (cannot
		 * happen with the built-in tables): emit it directly
		 * rather than overflow. */
		printf("%s%s\n",outstring,piece);
		outstring[0] = '\0';
		len = 0;
	    }
	    else
	    {
		memcpy( outstring + len, piece, piece_len + 1);
		len += piece_len;

		/* Loop, not a single test: what is left after a break
		 * can itself still be longer than TEXT_WIDTH. */
		while (len > (size_t)TEXT_WIDTH)
		{
		    char *brk = strrchr(outstring,' ');
		    if (brk)
		    {
			*(brk++) = '\0';
			printf("%s\n",outstring);
			len = strlen(brk);
			/* source and destination overlap */
			memmove(outstring,brk,len + 1);
		    }
		    else
		    {
			fwrite(outstring,1,TEXT_WIDTH,stdout);
			putc('\n',stdout);
			len -= TEXT_WIDTH;
			memmove(outstring,outstring + TEXT_WIDTH,len + 1);
		    }
		}
	    }
	    after_sep = (*(sp++)==7);
	}
    }
    if (outstring[0]==0)
	putc('\n',stdout);
    else
	printf("%s\n\n",outstring);
}

/* ............................................................. func ... */
/*
 * Write the usage summary to standard output and terminate the program
 * with exit status 2.  Does not return.
 */
void help(void)
{
    static const char usage[] =
	"catdoc - exctract text from MS-Word files and catenate it to stdout\n"
	"Copyright (c) by Victor B. Wagner, 1996\n"
	"Usage catdoc [-ast] files ...\n"
	"\t-a - converts non-standard printable chars into readable form (default)\n"
	"\t-t - converts them into TeX control sequences\n"
	"\t-s - exits with code 1 if MSWordDoc signature not found before\n"
	"\t\tfirst printable paragraph\n\n"
	"All options affects only files, specified AFTER them\n";

    fputs(usage, stdout);
    exit(2);
}

/* ............................................................. func ... */

/* Paragraph buffer filled by do_file(). */
char buf[8192];

/*
 * Read a Word file from f and print its text paragraphs via format().
 *
 * f           - input stream
 * map         - replacement table passed on to format()
 * search_sign - when non-zero, call exit(1) if a paragraph is reached
 *               before the string "MSWordDoc" has been seen
 *
 * Runs of text bytes are collected in buf; a '\r' ends the paragraph.  Any
 * other control byte except 2 discards what has been collected so far.  A
 * run too long for buf is cut into buffer-sized paragraphs instead of being
 * written past the end of buf.  Reading stops at end of file or on a read
 * error.
 */
void do_file(FILE *f, char **map, int search_sign)
{
    const int last = (int)sizeof buf - 1;	/* highest valid index */
    int ok =! search_sign;
    int bufptr;
    int c = 0;

    /* Test getc()'s result rather than feof(): on a read error getc()
     * returns EOF without setting the end-of-file flag, and a feof()
     * loop would then never terminate. */
    while (c != EOF)
    {
	bufptr = -1;

	do {
	    int keep = -1;		/* byte to store, or -1 for none */

	    c = getc(f);

	    /* Special printable symbols 7- table separator
	     *
	     * \r   - paragraph end
	     * 0x1E - short defis
	     *
	     */

	    if ((c<=255&&c>=32)||c==7||c=='\t'||c=='\r'||c==0x1E)
		keep = c;
	    else if (c==0x0b)
		keep = '\r';

	    if (keep >= 0)
	    {
		if (bufptr >= last - 1 && c != '\r')
		{
		    /* Only one slot left: push the byte back for the
		     * next round and end the paragraph here. */
		    ungetc(c, f);
		    c = keep = '\r';
		}
		buf[++bufptr] = (char)keep;
	    }
	    else
	    {
		if (!c)
		{
		    buf[++bufptr]=0;
		    if(!strcmp(buf,"MSWordDoc"))
		    {
			ok=1;
		    }
		}
		if (c!=2)
		    bufptr=-1;/* \002 is Word's footnote mark */
	    }
	}
	while (c!='\r'&&c!=EOF);

	if (bufptr>0&&buf[bufptr]=='\r')
	{
	    if (!ok)
		exit( 1);
	    buf[bufptr]=0;
	    format(buf,map);
	}
    }
}

/* ............................................................. func ... */

/*
 * Process the command line from left to right.
 *
 * -s, -t and -a change the settings used for the files that follow them;
 * "-" reads standard input (once only); any other argument is a file name.
 * Prints the usage text and exits with status 2 when called without
 * arguments or with an unknown option; exits with status 2 when a file
 * cannot be opened.  Returns 0 otherwise.
 */
int main(int argc,char **argv)
{
    /* search_sign:
     *     Must program exit with exit code 1 if MSWordDoc
     *     signature is not found?
     *
     * sequences:
     *     pointer to array of character sequences
     *     to represent special characters of Word
     */

    int	    search_sign = 0;
    char    **sequences = ascii_specs;
    int	    stdin_processed = 0;
    int	    i;

    if (argc<2)
    {
	help();
    }

    for (i = 1; i < argc; i++)
    {
	const char *arg = argv[i];

	if (!strcmp(arg,"-s"))
	{
	    search_sign=1;
	}
	else if (!strcmp(arg,"-t"))
	{
	    sequences=TeX_specs;
	}
	else if (!strcmp(arg,"-a"))
	{
	    sequences=ascii_specs;
	}
	else if (!strcmp(arg,"-"))
	{
	    if (stdin_processed)
	    {
		fprintf(stderr,"Cannot process standard input twice a row\n");
		exit (2);
	    }
	    do_file(stdin,sequences,search_sign);
	    stdin_processed=1;
	}
	else if (arg[0]=='-')
	{
	    fprintf(stderr,"Invalid option %s\n",arg);
	    help();
	}
	else
	{
	    /* Word files are binary: "rb" keeps text-mode translation
	     * from altering the bytes on systems that distinguish
	     * text and binary streams. */
	    FILE *f=fopen(arg,"rb");
	    if(!f)
	    {
		fprintf(stderr,"Cannot open file %s\n",arg);
		exit(2);
	    }
	    do_file(f,sequences,search_sign);
	    fclose(f);
	}
    }
    return 0;
}

/* end of file */