txt2html
Wie angedeutet besteht mein Hobby darin, vergriffene Bücher zu scannen und nach EPUB zu konvertieren. Die Texterkennung (Tesseract) produziert ein Textdokument mit „abgetippten“ Einzelzeilen. Zum Zusammenfügen der Zeilen und zum Auflösen der Silbentrennung dient das Tool txt2html.tcl. Einzelne Kommentare und Metadaten im produzierten HTML dienen meinem persönlichen Bedarf, stören aber nicht weiter.
#!/usr/bin/tclsh
#
# txt2html.tcl -- convert OCR'd plain text (one physical line per printed
# line) into a single XHTML document with one <p> per paragraph.
#
# Usage: txt2html.tcl infile ?outfile?
#
#   infile   text file to convert (read in the system encoding)
#   outfile  optional; when omitted the XHTML is written to stdout
#
# The output is always encoded as UTF-8, matching the XML declaration and
# the Content-Type meta element emitted in the header.

lassign $argv infile outfile

# Fail early with a readable message instead of an obscure "open" error.
if {$infile eq ""} {
    puts stderr "usage: [file tail [info script]] infile ?outfile?"
    exit 1
}

# Debugging helper: print all arguments as one list.
proc echo args {
    puts $args
}

# Return the contents of $file.
# When $encoding is non-empty the channel is switched to that encoding
# before reading; otherwise the system encoding applies.
proc cat {file {encoding ""}} {
    set port [open $file]
    if {$encoding ne ""} {
        fconfigure $port -encoding $encoding
    }
    set contents [read $port]
    close $port
    return $contents
}

# Write $string to $file without a trailing newline.
# When $encoding is non-empty the channel is switched to that encoding
# before writing; otherwise the system encoding applies.
proc saveString {string file {encoding ""}} {
    set port [open $file w]
    if {$encoding ne ""} {
        fconfigure $port -encoding $encoding
    }
    puts -nonewline $port $string
    close $port
}

# Guess the quotation style of $txt.
# Returns true when the first left-pointing mark (« ‹ <) occurs before the
# first right-pointing mark (» › >), i.e. quotes open with «; returns false
# otherwise, including when either kind of mark is absent.
proc swiss? txt {
    set ltIndices [regexp -inline -indices {[«‹<]} $txt]
    if {$ltIndices eq ""} {
        return false
    }
    # regexp -indices yields {{start end}}; keep only the start index.
    lassign {*}$ltIndices ltIdx

    set gtIndices [regexp -inline -indices {[»›>]} $txt]
    if {$gtIndices eq ""} {
        return false
    }
    lassign {*}$gtIndices gtIdx

    if {$ltIdx < $gtIdx} {
        return true
    } else {
        return false
    }
}

# --- text clean-up ---------------------------------------------------------

# Collapse every run of hyphens/dashes into a single ASCII hyphen.
regsub -all {[-–—]+} [cat $infile] - txt

# Two or more dots (optionally separated by whitespace) become " ...".
regsub -all {(\s*[.]){2,}} $txt " ..." txt

# Expand typographic ligatures (ff, fi, fl, ffi, ffl) into plain letters.
set txt [string map "\ufb00 ff \ufb01 fi \ufb02 fl \ufb03 ffi \ufb04 ffl" $txt]

# Turn ASCII angle brackets into guillemets and escape "&" for XHTML.
# The nesting order of the triple forms depends on the quotation style.
# Keys are tried in list order, so the longer sequences must come first.
if {[swiss? $txt]} {
    set txt [string map {>>> ›» <<< «‹ >> » << « > › < ‹ & &amp;} $txt]
} else {
    set txt [string map {>>> »› <<< ‹« >> » << « > › < ‹ & &amp;} $txt]
}

# Disabled: strip stray punctuation / "l" / "I" noise at line edges.
# regsub -all {(?:\s+[[:punct:]|lI])+\n} $txt \n txt
# regsub -all {\n(?:[[:punct:]|lI]\s+)+} $txt \n txt

# Drop indentation at the start of lines.
regsub -all {\n +} $txt \n txt

# Three or more newlines collapse into one paragraph break.
regsub -all {\n{3,}} $txt \n\n txt

# Old German hyphenation: "k-" + newline + "k" between vowels rejoins as "ck".
regsub -all {([AEIOUÄÖÜaeiouäöü])k-\nk([aeiouäöü])} $txt {\1ck\2} txt

# Rejoin words hyphenated across a line break (lower-case letters only).
regsub -all {([a-zäöüß])-\n([a-zäöüß])} $txt {\1\2} txt

# Join the remaining single line breaks inside a paragraph with one space.
regsub -all {(\S) *\n(\S)} $txt {\1 \2} txt

# A hyphen standing between punctuation/whitespace becomes an en dash.
regsub -all {([[:punct:][:space:]])-([[:space:][:punct:]])} $txt {\1–\2} txt

# Link markup: [url "label"] becomes <a href="url">label</a>.
regsub -all {\[(\S+)\s+"([^"]*)"\]} $txt {<a href="\1">\2</a>} txt

# Wrap each blank-line-separated paragraph in <p> ... </p>.
set txt "<p> [string map [list \n\n " </p>\n\n<p> "] [string trim $txt]] </p>"

# --- document frame --------------------------------------------------------

# The title is the input file name without directory and extension; markup
# characters in it are escaped so the header stays well-formed.
set title [string map {& &amp; < &lt; > &gt;} [file tail [file root $infile]]]

# NOTE(review): the "stringmap" content attribute contains raw "<", which
# strict XML parsers reject. It is left untouched because downstream tooling
# may read it verbatim -- confirm before escaping it.
append top {<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="de" lang="de">}\
    \n <head> \n\
    {<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />} \n\
    <title> \n\
    $title \n\
    </title> \n\
    {<meta name="author" content="Unknown" />} \n\n\
    {<meta name="DC.language" content="de" scheme="DCTERMS.RFC3066" />} \n\n\
    {<meta name="splitlevel" content="1" />} \n\n\
    {<meta name="wordmap" content="" />} \n\n\
    {<meta name="stringmap" content=" ʼ ’\
 ₀ <sub>0</sub> ₁ <sub>1</sub> ₂ <sub>2</sub> ₃ <sub>3</sub> ₄ <sub>4</sub>\
 ₅ <sub>5</sub> ₆ <sub>6</sub> ₇ <sub>7</sub> ₈ <sub>8</sub> ₉ <sub>9</sub>\
 ⁰ <sup>0</sup> ¹ <sup>1</sup> ² <sup>2</sup> ³ <sup>3</sup> ⁴ <sup>4</sup>\
 ⁵ <sup>5</sup> ⁶ <sup>6</sup> ⁷ <sup>7</sup> ⁸ <sup>8</sup> ⁹ <sup>9</sup> " />} \n\n\
    {<meta name="hyphen" content="" />} \n\n\
    {<style type="text/css">
#cover-image img, p+img {
  display: block;
  max-height: 100%;
  max-width: 100%;
}
hr {
  border-width: 0px;
  height: 0px;
}
h1 {
  line-height: 120%;
  border-width: 1pt;
  border-bottom-style: solid;
  padding-bottom: 1em;
  width: auto;
}
h1, h2, h3, h4, h5, h6 {
  text-align: center;
  page-break-before: always;
  page-break-after: avoid;
  font-size: 100%;
}
h1 + div + h2, h2 + div + h3, h3 + div + h4, h4 + div + h5, h5 + div + h6,
h1 + h2, h2 + h3, h3 + h4, h4 + h5, h5 + h6 {
  page-break-before: avoid;
}
p, li {
  margin-top: 0.5em;
  margin-bottom: 0px;
}
p {
  -moz-hyphens: auto;
  -o-hyphens: auto;
  -webkit-hyphens: auto;
  -ms-hyphens: auto;
  hyphens: auto;
  text-align: justify;
}
li {
  text-align: justify;
}
body > div > div > p+p, blockquote > p+p, body > p+p, li > p+p {
  text-indent: 1.25em;
  margin-top: 0px;
}
p+* {
  margin-top: 0em;
}
tt, code, pre {
  font-size: 0.9em;
}
pre {
  line-height: 0.9em;
  page-break-inside: avoid;
}
body {
  padding: 0.5em;
}
sup {
  font-size: 80%;
  display: inline;
  line-height: 50%;
  position: relative;
  top: -0.1em;
}
ol[type="a"] {
  list-style-type: lower-alpha;
}
ol[type="1"] {
  list-style-type: decimal;
}
ul, ol, dl, blockquote {
  margin-top: 0.5em;
  margin-bottom: 0.5em;
  margin-right: 0em;
}
dd {
  page-break-before: avoid;
}
p+blockquote {
  page-break-before: avoid;
}
div p {
  page-break-inside: avoid;
}
div p.ebenda {
  page-break-before: avoid;
}
</style>} \n\n\
    </head> \n\
    {<!--
- Ligaturen
- Staub
- Ziffern zwischen Buchstaben
- Öffnende,
- Schließende Gänsefüßchen
- Satz-Enden
- Seitenwechsel
- Kommentare innerhalb von Wörtern
- Rechtschreibkontrolle
- Korrekturlesen
- Abschnitte
- Hervorhebungen
-->} \n\
    {<body>} \n

append bottom \n </body> \n </html>

set txt $top$txt$bottom

# --- output ----------------------------------------------------------------

# Write UTF-8 in both cases so the bytes agree with the declared encoding.
if {$outfile ne ""} {
    saveString $txt $outfile utf-8
} else {
    fconfigure stdout -encoding utf-8
    puts $txt
}
19.10.2022
<< | Heimatseite | Verzeichnis | Stichworte | Autor | >>