#!/usr/bin/perl
##
## sgmlstripper - Strip SGML markup from input.
##
## by Robert J Seymour
## Copyright 1995, 1996, Robert Seymour and Springer-Verlag.
## All rights reserved.  This program may be distributed and/or
## modified in electronic form under the same terms as Perl
## itself.
##
## CPAN menu:
#
# File Name: sgmlstripper
# File Size in BYTES: 1469
# Sender/Author/Poster: Robert J. Seymour
# Subject: sgmlstripper - Strip SGML markup from input.
#
# sgmlstripper removes SGML markup tags from input (taken through
# specified files or STDIN).  sgmlstripper uses a
# character-by-character read mode which, though not as fast as a
# regexp, is guaranteed to strip tags which fall across line or
# paragraph boundaries and preserves whitespace so that line numbers
# will be the same (the latter is useful for search engines which
# don't want to index markup, but want line numbers to be preserved).

use strict;
use warnings;

## strip_sgml(IN, OUT)
##
## Copy characters from the filehandle IN to the filehandle OUT,
## dropping everything from a "<" up to and including the next ">".
##
##   * Newlines that occur inside a tag are still written, so every
##     line of output keeps the line number it had in the input.
##   * A tag that is still open at end of input is simply discarded.
##   * Tag state is local to one call, so an unterminated tag in one
##     file cannot swallow the beginning of the next file.
##
## Returns nothing.
sub strip_sgml {
    my ($in, $out) = @_;

    my $in_tag = 0;

    # getc returns undef at end of input.  The test must be for
    # definedness, not truth: the character "0" is false in Perl and
    # would otherwise end the loop early.
    while (defined(my $char = getc($in))) {
        if ($in_tag) {
            if ($char eq ">") {
                $in_tag = 0;
            }
            elsif ($char eq "\n") {
                # Keep line numbering intact for tags spanning lines.
                print {$out} $char;
            }
        }
        elsif ($char eq "<") {
            $in_tag = 1;
        }
        else {
            print {$out} $char;
        }
    }

    return;
}

## main(FILES...)
##
## Strip markup from each named file in turn, writing to STDOUT.
## With no arguments, or for the name "-", STDIN is read instead.
## A file that cannot be opened is reported on STDERR and skipped.
sub main {
    my @files = @_;

    ## Use STDIN if no files are given
    @files = ("-") unless @files;

    foreach my $file (@files) {
        if ($file eq "-") {
            strip_sgml(\*STDIN, \*STDOUT);
            next;
        }

        # Three-argument open: the name is always treated as a plain
        # file to read, never as a mode prefix or a command to run.
        my $input;
        unless (open($input, "<", $file)) {
            warn "sgmlstripper: cannot open $file: $!\n";
            next;
        }

        strip_sgml($input, \*STDOUT);
        close($input);
    }

    return;
}

# Run only when executed as a program, so that the file can also be
# loaded with require (e.g. by a test) without reading any input.
main(@ARGV) unless caller;

1;