bbl

Read, search and index the Bible on the command line -- Greek, Latin, KJV, Knox, RSV, and more
git clone git://git.wilsonrgheen.com/bbl
Log | Files | Refs | README | LICENSE

hebget.sh (5827B)


      1 #!/bin/sh
      2 # https://github.com/thenewmantis/bbl.git
      3 
      4 # !!!NOTE!!! This file will probably not display as intended if viewed in a web browser.
      5 # Web browsers use a feature called "bidi" while displaying text. With bidi, characters from scripts that are read right-to-left will be displayed
      6 # in their proper orientation alongside regular (left-to-right) characters. This file itself was written while coping with the fact that Hebrew
      7 # must be displayed wrong at times for the purpose of working with text in a sane way. Therefore all strings in this shell script that are written in Hebrew
      8 # are actually backwards (written left-to-right) in this file. In the lines (5 lines after this one) where I show example output, I have letters written
      9 # right-to-left, as they should be, but web browsers will automatically reverse the display orientation of all Hebrew letters relative to the way they are encoded in files.
     10 # The only fully accurate way that I am aware of to view this file is to download and open it in a text editor.
     11 
     12 # This script is intended to pull all verses of the Hebrew Bible from the web into plain text, with one verse per line, in the following format: (e.g.)
     13 # בראשית	בר	א	א	א	בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃
     14 # Which, in the command line application that these verses are used for, would produce the following output:
     15 # תישארב
     16 # ׃ץֶרָֽאָה תֵ֥אְו םִיַ֖מָּׁשַה תֵ֥א םיִ֑הֹלֱא אָ֣רָּב תיִׁ֖שאֵרְּב	א:א
     17 # (Rendered right-to-left, as is proper.)
     18 # The operation of this script is, of course, dependent on the website that hosts the content keeping its URLS and HTML the same, or at least still compatible with the regex used.
     19 # Please feel free to modify and reuse this script or another one like it in order to versify any text you find online
     20 # Every line in the resulting file should match the following regex (typed exactly as it would be in a Vimscript command (but ignore the surrounding whitespace)):
     21 #                      ^[א-ת]\+\t\%([א-ת]\+ \)\?[א-ת]\+\t[א-ת]\{1,2}\%(\tק\?[א-ת]\{0,2}\&.\+\)\{2}\t\D\+$
     22 # To run, simply run: `./hebget.sh`. This will silently overwrite any file named "h.tsv" in the current directory.
     23 
     24 printf="/usr/bin/env printf"
     25 b='01'
     26 bAbs=1
     27 c=1
     28 myFile="hebTemp.txt"
     29 tsv="h.tsv"
     30 >"$myFile"
     31 >"$tsv"
     32 
     33 download() {
     34     url="$($printf 'https://mechon-mamre.org/c/ct/c%s%02d.htm' "$b" "$c")"
     35     curl -L "$url" -o "$myFile"
     36 }
     37 nextBook() {
     38     # Book 26 is psalms, which has 26, 26a, 26b, 26c, 26d and 26e
     39     if echo "$b" | grep -q '^26'; then
     40        case "$b" in
     41            26a) b='26b' ;;
     42            26b) b='26c' ;;
     43            26c) b='26d' ;;
     44            26d) b='26e' ;;
     45            26e)
     46                b='27'
     47                bAbs=$(( bAbs + 1 )) ;;
     48        esac
     49        return
     50     fi
     51     bAbs=$(( bAbs + 1 ))
     52     next="$($printf '%02d' "$(($(echo "$b" | grep -o '[1-9][0-9]\?') + 1))")"
     53     # These books all have parts a and b (e.g. there is no 25, only 25a and 25b)
     54     for n in 08 09 25 35; do
     55         if [ "$b" = "${n}a" ]; then
     56             b="${n}b"
     57             return
     58         elif [ "$b" = "${n}b" ]; then
     59             if [ "$n" = 08 ]; then
     60                 b='09a'
     61             else
     62                 b="$next"
     63             fi
     64             return
     65         elif [ "$next" = "$n" ]; then
     66             b="${next}a"
     67             return
     68         fi
     69     done
     70     b="$next"
     71 }
     72 getAbbreviation() {
     73     case "$1" in
     74         "דברי ה*")
     75             echo 'ימ';;
     76         "שמות")
     77             echo 'שת';;
     78         "שמואל")
     79             echo 'של';;
     80         "מלכ*")
     81             echo 'מלכ';;
     82         "מלא*")
     83             echo 'מלא';;
     84         "יואל")
     85             echo 'יל';;
     86         "יונה")
     87             echo 'ינ';;
     88         *)
     89             echo "$1" | grep -o '^..'
     90     esac
     91 }
     92 hebNum() {
     93     # Gets the Hebrew numeral corresponding to the integer given--only intended for numbers 1-499
     94     case "$1" in
     95         0 | 00) ;;
     96         ? | 10)
     97             $printf "\u$($printf '%04x' $(( 1487 + $1 )))" ;;
     98         0?)
     99             hebNum "$(echo "$1" | cut -c2-2)" ;;
    100         15)
    101             $printf 'טו' ;;
    102         16)
    103             $printf 'טז' ;;
    104         ??)
    105             dig2="$(hebNum "$(echo "$1" | cut -c2-2)")"
    106             addend=1496
    107             if [ "$1" -ge 90 ]; then #because of tsade sofit
    108                 addend=1501
    109             elif [ "$1" -ge 80 ]; then #because of pe sofit
    110                 addend=1500
    111             elif [ "$1" -ge 50 ]; then #...because of nun sofit
    112                 addend=1499
    113             elif [ "$1" -ge 40 ]; then #because of mem sofit
    114                 addend=1498
    115             elif [ "$1" -ge 20 ]; then #because of kaf sofit
    116                 addend=1497
    117             fi
    118             dig1hex="$($printf '%04x' "$(( addend + $(echo "$1" | cut -c1-1)))")"
    119             $printf "\u$dig1hex$dig2" ;;
    120         ???)
    121             digs="$(hebNum "$(echo "$1" | cut -c2-3)")"
    122             dig1hex="$($printf '%04x' "$((1510 + $(echo "$1" | cut -c1-1)))")"
    123             $printf "\u$dig1hex$digs" ;;
    124         *) ;;
    125     esac
    126 }
    127 
    128 download
    129 while ! grep -qi '404 not found' "$myFile"; do
    130     title="$(grep -Po '(?<=<H1>)[^<]*' "$myFile" | cut -d ' ' -f1)"
    131     [ title = 'דברי' ] && title='דברי הימים'
    132     [ title = 'שיר' ] && title='שיר השירים'
    133     abbr="$(getAbbreviation "$title")"
    134     bNumHeb="$(hebNum "$bAbs")"
    135     while ! grep -qi '404 not found' "$myFile"; do
    136         cNumHeb="$(hebNum "$c")"
    137         awk "/<B>/{print \"$title\t$abbr\t$bNumHeb\t$cNumHeb\" \$0}" "$myFile" | sed -e 's/<B>\|<\/B> /\t/g' -e 's/\(\S\)<BR>\(\S\)/\1 \2/g' -e 's/<[^>]\+>//g' >> "$tsv"
    138         c=$(( c + 1 ))
    139         download
    140     done
    141     nextBook
    142     c=1
    143     download
    144 done