txt2html

Wie angedeutet besteht mein Hobby darin, vergriffene Bücher zu scannen und nach EPUB zu konvertieren. Der Scanvorgang mit Texterkennung (Tesseract) produziert ein Textdokument mit „abgetippten“ Einzelzeilen. Zum Zusammenfügen der durch Silbentrennung getrennten Wörter und der Einzelzeilen zu Absätzen dient das Tool txt2html.tcl. Einzelne Kommentare und Metadaten im erzeugten HTML dienen meinem persönlichen Bedarf, stören aber nicht weiter.


#!/usr/bin/tclsh

# Command line: first argument is the input text file, second (optional)
# argument is the output file.  A missing argument yields the empty string.
set infile [lindex $argv 0]
set outfile [lindex $argv 1]

proc echo {args} {
    # Debug helper: print all arguments, formatted as one Tcl list, on a
    # single line of standard output.
    puts stdout $args
}

proc cat {file {encoding ""}} {
    # Return the entire contents of $file as a string.
    #
    # file     - path of the file to read
    # encoding - optional Tcl encoding name (for example utf-8); when it is
    #            empty the channel keeps the encoding it was opened with
    #
    # Errors from open, fconfigure or read are passed on to the caller.
    # The channel is closed in every case, so an unknown encoding name or a
    # failed read does not leave an open file handle behind.
    set port [open $file]
    catch {
        if {$encoding ne ""} {
            fconfigure $port -encoding $encoding
        }
        read $port
    } contents options
    close $port
    # Re-deliver the captured outcome: the text on success, or the original
    # error together with its errorInfo/errorCode on failure.
    return -options $options $contents
}

proc saveString {string file {encoding ""}} {
    # Write $string to $file, replacing any previous contents.  No trailing
    # newline is added.
    #
    # string   - text to write
    # file     - path of the file to create or truncate
    # encoding - optional Tcl encoding name (for example utf-8); when it is
    #            empty the channel keeps the encoding it was opened with
    #
    # Errors from open, fconfigure, puts or close are passed on to the
    # caller.  The channel is closed even when configuring or writing fails.
    set port [open $file w]
    catch {
        if {$encoding ne ""} {
            fconfigure $port -encoding $encoding
        }
        puts -nonewline $port $string
    } result options
    # close flushes the buffered data; an error raised here propagates.
    close $port
    # Re-deliver the captured outcome of the write (empty result on success,
    # the original error on failure).
    return -options $options $result
}

proc swiss? {txt} {
  # Decide which way quotation marks point in $txt.
  #
  # Returns true when the text contains both a left-pointing mark
  # (one of « ‹ <) and a right-pointing mark (one of » › >) and the first
  # left-pointing mark comes before the first right-pointing one, i.e. when
  # quotations open with «.  Returns false in every other case, including
  # when either kind of mark is missing.
  if {![regexp -indices {[«‹<]} $txt firstLeft]} {
    return false
  }
  if {![regexp -indices {[»›>]} $txt firstRight]} {
    return false
  }
  # Each match variable holds a {start end} pair; compare the start offsets.
  if {[lindex $firstLeft 0] < [lindex $firstRight 0]} {
    return true
  }
  return false
}

# --- Character-level normalisation of the input text -----------------------

# Read the input file; every run of hyphens, en dashes and em dashes is
# collapsed into a single ASCII hyphen.
regsub -all {[-–—]+} [cat $infile] - txt

# Two or more dots, each optionally preceded by whitespace, become " ...".
regsub -all {(\s*[.]){2,}} $txt " ..." txt

# Expand typographic ligatures into plain letters.  Besides the f-ligatures
# U+FB00..U+FB04 this also covers U+FB05 (long s + t) and U+FB06 (s + t),
# which both fold to "st" under Unicode compatibility mapping.
set txt [string map [list \
    \ufb00 ff  \ufb01 fi  \ufb02 fl  \ufb03 ffi  \ufb04 ffl \
    \ufb05 st  \ufb06 st] $txt]

# ASCII angle brackets are replaced by guillemets and "&" is escaped, so no
# raw "<", ">" or "&" from the input survives this step.  Only the triple
# forms (a double and a single mark in a row) depend on which way the
# quotation marks point in this text.
if {[swiss? $txt]} {
  set tripleQuoteMap {>>> ›» <<< «‹}
} else {
  set tripleQuoteMap {>>> »› <<< ‹«}
}
# string map tries the keys in list order, so the longer forms must stay
# ahead of the shorter ones.
set txt [string map [concat $tripleQuoteMap {>> » << « > › < ‹ & &amp;}] $txt]

# --- Line joining and paragraph markup --------------------------------------

# Disabled: these two substitutions would drop isolated punctuation marks,
# "|", "l" or "I" at the end / start of a line (presumably scan specks that
# were recognised as characters -- confirm before re-enabling).
# regsub -all {(?:\s+[[:punct:]|lI])+\n} $txt \n txt

# regsub -all {\n(?:[[:punct:]|lI]\s+)+} $txt \n txt

# Strip leading spaces from every line.
regsub -all {\n +} $txt \n txt

# Three or more consecutive newlines count as one paragraph break.
regsub -all {\n{3,}} $txt \n\n txt

# A "k-" / "k" split across a line break after a vowel is rejoined as "ck".
regsub -all {([AEIOUÄÖÜaeiouäöü])k-\nk([aeiouäöü])} $txt {\1ck\2} txt

# Rejoin words hyphenated across a line break (lower-case letter on both
# sides).  The following letter is only looked at, not consumed: matches of
# regsub -all cannot overlap, and consuming it would hide a second break
# directly after a one-letter line.
regsub -all {([a-zäöüß])-\n(?=[a-zäöüß])} $txt {\1} txt

# A single newline between two non-blank characters becomes a space, so the
# lines of a paragraph are joined.  The lookahead keeps the first character
# of the next line available as the left context of the next match; without
# it, "a\nb\nc" was only joined to "a b\nc".
regsub -all {(\S) *\n(?=\S)} $txt {\1 } txt

# A hyphen standing between punctuation/whitespace on both sides is turned
# into an en dash.  Lookahead for the same reason as above.
regsub -all {([[:punct:][:space:]])-(?=[[:space:][:punct:]])} $txt {\1–} txt

# [target "label"] becomes a hyperlink.
regsub -all {\[(\S+)\s+"([^"]*)"\]} $txt {<a href="\1">\2</a>} txt

# Wrap the text in <p> elements; every remaining blank line separates two
# paragraphs.
set txt "<p> [string map [list \n\n " </p>\n\n<p> "] [string trim $txt]] </p>"

# Build the XHTML prologue ($top) and epilogue ($bottom) that enclose the
# converted text.  append concatenates its arguments without separators, so
# every line break in the output is an explicit \n argument.
#
# - The title is the input file name without directory and extension; the
#   characters & < > are escaped because a file name may contain them.
# - Every element is closed and every attribute value is quoted so that the
#   result is well-formed XHTML.
# - NOTE(review): the content of the "stringmap" meta still contains raw "<"
#   characters, which strict XML parsers reject inside attribute values.  It
#   is left as it was because whatever reads this meta may expect the literal
#   markup -- confirm before escaping it.
append top {<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="de" lang="de">}\
  \n <head> \n\
  {<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />} \n\
  <title> \n\
  [string map {& &amp; < &lt; > &gt;} [file tail [file root $infile]]] \n\
  </title> \n\
  {<meta name="author" content="Unknown" />} \n\n\
  {<meta name="DC.language" content="de" scheme="DCTERMS.RFC3066" />} \n\n\
  {<meta name="splitlevel" content="1" />} \n\n\
  {<meta name="wordmap" content="" />} \n\n\
  {<meta name="stringmap" content=" ʼ ’\
    ₀ <sub>0</sub> ₁ <sub>1</sub> ₂ <sub>2</sub> ₃ <sub>3</sub> ₄ <sub>4</sub>\
    ₅ <sub>5</sub> ₆ <sub>6</sub> ₇ <sub>7</sub> ₈ <sub>8</sub> ₉ <sub>9</sub>\
    ⁰ <sup>0</sup> ¹ <sup>1</sup> ² <sup>2</sup> ³ <sup>3</sup> ⁴ <sup>4</sup>\
    ⁵ <sup>5</sup> ⁶ <sup>6</sup> ⁷ <sup>7</sup> ⁸ <sup>8</sup> ⁹ <sup>9</sup> " />} \n\n\
  {<meta name="hyphen" content="" />} \n\n\
  {<style type="text/css">
#cover-image img,
p+img {
	display: block;
	max-height: 100%;
	max-width: 100%;
}

hr {
	border-width: 0px;
	height: 0px;
}

h1 {
	line-height: 120%;
	border-width: 1pt;
	border-bottom-style: solid;
	padding-bottom: 1em;
	width: auto;
}

h1, h2, h3, h4, h5, h6 {
	text-align: center;
	page-break-before: always;
	page-break-after: avoid;
	font-size: 100%;
}

h1 + div + h2, h2 + div + h3, 
h3 + div + h4, h4 + div + h5, h5 + div + h6,
h1 + h2, h2 + h3, h3 + h4, h4 + h5, h5 + h6 {
	page-break-before: avoid;
}

p, li {
    margin-top: 0.5em;
    margin-bottom: 0px;
}

p {
    -moz-hyphens: auto;
    -o-hyphens: auto;
    -webkit-hyphens: auto;
    -ms-hyphens: auto;
    hyphens: auto;
    text-align: justify;
}

li {
    text-align: justify;
}

body > div > div > p+p,
blockquote > p+p,
body > p+p,
li > p+p {
	text-indent: 1.25em;
	margin-top: 0px;
}

p+* {
	margin-top: 0em;
}

tt, code, pre {
	font-size: 0.9em;
}

pre {
	line-height: 0.9em;
	page-break-inside: avoid; 
}

body {
    padding: 0.5em;
}

sup {
	font-size: 80%;
	display: inline;
	line-height: 50%;
	position: relative;
	top: -0.1em;
}

ol[type="a"] {
	list-style-type: lower-alpha;
}

ol[type="1"] {
	list-style-type: decimal;
}

ul, ol, dl, blockquote {
	margin-top: 0.5em;
	margin-bottom: 0.5em;
	margin-right: 0em;
}

dd {
	page-break-before: avoid;
}

p+blockquote {
	page-break-before: avoid;
}

div p {
	page-break-inside: avoid;
}

div p.ebenda {
	page-break-before: avoid;
}
</style>} \n\n\
  </head> \n {<!--
 - Ligaturen
 - Staub
 - Ziffern zwischen Buchstaben
 - Öffnende,
 - Schließende Gänsefüßchen
 - Satz-Enden
 - Seitenwechsel
 - Kommentare innerhalb von Wörtern
 - Rechtschreibkontrolle
 - Korrekturlesen
 - Abschnitte
 - Hervorhebungen
-->} \n {<body>} \n
append bottom \n </body> \n </html>

# Assemble the complete document and emit it: into $outfile when one was
# given, otherwise on standard output.
set txt $top$txt$bottom

# The prologue declares the document as UTF-8, so the output channel is
# switched to UTF-8 explicitly instead of relying on the system encoding.
# The file channel is opened here so that its encoding can be set before
# anything is written.
if {$outfile ne ""} {
  set out [open $outfile w]
  # Close the channel even when configuring or writing fails, then
  # re-deliver the captured outcome.
  catch {
    fconfigure $out -encoding utf-8
    # File output carries no trailing newline.
    puts -nonewline $out $txt
  } result options
  close $out
  return -options $options $result
} else {
  fconfigure stdout -encoding utf-8
  puts $txt
}

19.10.2022