hebget.sh (5827B)
#!/bin/sh
# https://github.com/thenewmantis/bbl.git

# !!!NOTE!!! This file will probably not display as intended if viewed in a web browser.
# Web browsers use a feature called "bidi" while displaying text. With bidi, characters from scripts that are read right-to-left will be displayed
# in their proper orientation alongside regular (left-to-right) characters. This file itself was written while coping with the fact that Hebrew
# must be displayed wrong at times for the purpose of working with text in a sane way. Therefore all strings in this shell script that are written in Hebrew
# are actually backwards (written left-to-right) in this file. In the lines (5 lines after this one) where I show example output, I have letters written
# right-to-left, as they should be, but web browsers will automatically reverse the display orientation of all Hebrew letters relative to the way they are encoded in files.
# The only fully accurate way that I am aware of to view this file is to download and open it in a text editor.

# This script is intended to pull all verses of the Hebrew Bible from the web into plain text, with one verse per line, in the following format: (e.g.)
# בראשית בר א א א בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃
# Which, in the command line application that these verses are used for, would produce the following output:
# תישארב
# ׃ץֶרָֽאָה תֵ֥אְו םִיַ֖מָּׁשַה תֵ֥א םיִ֑הֹלֱא אָ֣רָּב תיִׁ֖שאֵרְּב א:א
# (Rendered right-to-left, as is proper.)
# The operation of this script is, of course, dependent on the website that hosts the content keeping its URLs and HTML the same, or at least still compatible with the regex used.
# Please feel free to modify and reuse this script or another one like it in order to versify any text you find online.
# Every line in the resulting file should match the following regex (typed exactly as it would be in a Vimscript command (but ignore the surrounding whitespace)):
# ^[א-ת]\+\t\%([א-ת]\+ \)\?[א-ת]\+\t[א-ת]\{1,2}\%(\tק\?[א-ת]\{0,2}\&.\+\)\{2}\t\D\+$
# To run, simply run: `./hebget.sh`. This will silently overwrite any file named "h.tsv" in the current directory.

# External printf, invoked through env so that the \uXXXX escapes used in
# hebNum do not depend on the shell's builtin printf supporting them.
printf="/usr/bin/env printf"
b='01'                  # book id as it appears in the URL (two digits, optional part letter)
bAbs=1                  # running book counter that is written to the output
c=1                     # chapter number within the current book
myFile="hebTemp.txt"    # scratch file holding the most recently downloaded page
tsv="h.tsv"             # output file, one verse per line
# Create/truncate both files up front.
: >"$myFile"
: >"$tsv"

# Fetch the page for book $b, chapter $c into $myFile.
# Globals:  b, c, myFile (read); url (written)
# Exits the script if curl itself fails. An HTTP 404 is still a successful
# transfer and is detected by the callers from the page body. Without this
# check, a transport failure would leave the previous page in $myFile and the
# main loops would keep re-processing it forever.
download() {
    url="$($printf 'https://mechon-mamre.org/c/ct/c%s%02d.htm' "$b" "$c")"
    if ! curl -L "$url" -o "$myFile"; then
        echo "hebget: failed to download $url" >&2
        exit 1
    fi
}

# Advance $b to the id of the next book and bump $bAbs where appropriate.
# Globals:  b, bAbs (read and written); num, next (written)
nextBook() {
    # Book 26 is psalms, which has 26, 26a, 26b, 26c, 26d and 26e.
    # bAbs is only bumped when leaving the last part.
    case "$b" in
        26)  b='26a'; return ;;   # previously unhandled: b stayed '26' forever
        26a) b='26b'; return ;;
        26b) b='26c'; return ;;
        26c) b='26d'; return ;;
        26d) b='26e'; return ;;
        26e)
            b='27'
            bAbs=$(( bAbs + 1 ))
            return ;;
    esac
    bAbs=$(( bAbs + 1 ))
    # Part "a" of a two-part book is always followed by its part "b".
    case "$b" in
        *a)
            b="${b%a}b"
            return ;;
    esac
    # Numeric part of the id without the part letter, and without a leading
    # zero so that 08 and 09 are not rejected as invalid octal numbers.
    num=${b%[a-z]}
    num=${num#0}
    next="$(printf '%02d' "$(( num + 1 ))")"
    # These books all have parts a and b (e.g. there is no 25, only 25a and 25b)
    case "$next" in
        08 | 09 | 25 | 35) b="${next}a" ;;
        *) b="$next" ;;
    esac
}

# Print the abbreviation used in the output for a book title.
# Arguments: $1 - book title
# Outputs:   the abbreviation, followed by a newline
getAbbreviation() {
    case "$1" in
        # The trailing * must stay outside the quotes; a quoted * is matched
        # literally and the arm would never fire.
        'דברי ה'*)
            echo 'ימ' ;;
        'שמות')
            echo 'שת' ;;
        'שמואל')
            echo 'של' ;;
        'מלכ'*)
            echo 'מלכ' ;;
        'מלא'*)
            echo 'מלא' ;;
        'יואל')
            echo 'יל' ;;
        'יונה')
            echo 'ינ' ;;
        *)
            # Default: whatever grep treats as the first two characters.
            printf '%s\n' "$1" | grep -o '^..' ;;
    esac
}

# Gets the Hebrew numeral corresponding to the integer given--only intended for numbers 1-499
# Arguments: $1 - decimal number, optionally zero-padded to two digits
# Outputs:   the numeral, without a trailing newline (nothing for 0 / 00)
hebNum() {
    case "$1" in
        0 | 00) ;;
        ? | 10)
            # 1..10 map onto the consecutive code points U+05D0..U+05D9.
            $printf "\u$(printf '%04x' "$(( 1487 + $1 ))")" ;;
        0?)
            hebNum "${1#0}" ;;
        15)
            $printf 'טו' ;;
        16)
            $printf 'טז' ;;
        ??)
            dig2="$(hebNum "${1#?}")"
            # Offset from the tens digit to its letter's code point. It grows
            # each time a final-form (sofit) letter sits between two regular
            # letters in the Unicode block.
            addend=1496
            if [ "$1" -ge 90 ]; then #because of tsade sofit
                addend=1501
            elif [ "$1" -ge 80 ]; then #because of pe sofit
                addend=1500
            elif [ "$1" -ge 50 ]; then #...because of nun sofit
                addend=1499
            elif [ "$1" -ge 40 ]; then #because of mem sofit
                addend=1498
            elif [ "$1" -ge 20 ]; then #because of kaf sofit
                addend=1497
            fi
            dig1hex="$(printf '%04x' "$(( addend + ${1%?} ))")"
            $printf "\u$dig1hex$dig2" ;;
        ???)
            # Hundreds letter followed by the numeral for the last two digits.
            digs="$(hebNum "${1#?}")"
            dig1hex="$(printf '%04x' "$(( 1510 + ${1%??} ))")"
            $printf "\u$dig1hex$digs" ;;
        *) ;;
    esac
}

# Main: walk books and chapters until the site answers with a "404 not found"
# page. The inner loop ends at the first missing chapter of a book; the outer
# loop ends when even chapter 1 of the next book id is missing.
download
while ! grep -qi '404 not found' "$myFile"; do
    # Only the first word of the <H1> heading is kept; the two books whose
    # names consist of two words are restored by hand.
    title="$(grep -Po '(?<=<H1>)[^<]*' "$myFile" | cut -d ' ' -f1)"
    case "$title" in
        'דברי') title='דברי הימים' ;;
        'שיר') title='שיר השירים' ;;
    esac
    abbr="$(getAbbreviation "$title")"
    bNumHeb="$(hebNum "$bAbs")"
    while ! grep -qi '404 not found' "$myFile"; do
        cNumHeb="$(hebNum "$c")"
        # Every line containing <B> is emitted with the
        # title/abbreviation/book/chapter columns glued in front of it. The
        # first sed expression then turns "<B>" and "</B> " into tabs, the
        # second replaces a <BR> between two non-blank characters with a
        # space, and the third strips the remaining tags.
        # The prefix is handed to awk with -v (which expands the \t escapes)
        # instead of being spliced into the awk program text.
        awk -v prefix="$title\t$abbr\t$bNumHeb\t$cNumHeb" '/<B>/ { print prefix $0 }' "$myFile" |
            sed -e 's/<B>\|<\/B> /\t/g' -e 's/\(\S\)<BR>\(\S\)/\1 \2/g' -e 's/<[^>]\+>//g' >> "$tsv"
        c=$(( c + 1 ))
        download
    done
    nextBook
    c=1
    download
done