Posted By

ascarion on 03/15/10


Tagged

Bash html diff compare


Versions (?)

Compare the state of web pages with a previous state


 / Published in: Bash
 

This can be used to check if an update of a CMS (or blog, plugin, ...) changed something in the frontend (content or HTML).

Run the script with the file containing the URLs as its parameter:

wiff.sh urllist.txt

urllist.txt contains the URLs, one per line.

The snapshots are stored in "~/wiff" by default.

You can use diff instead of wdiff: wiff.sh -c diff urllist.txt

  1. #!/usr/bin/env bash
  2.  
  # ---------------------------------------------------------------------------
  # Default configuration. The command-line options -c, -d, -n, -o and -p
  # override CMD, DIR, INTERACTIVE, DIFFONLY and PAUSE respectively.
  # ---------------------------------------------------------------------------
  VER="0.10"   # version string shown in the usage text
  DIR=~/wiff   # root directory under which the snapshots are stored

  # Tool lookup uses the `command -v` builtin instead of the external `which`,
  # which is absent on some minimal systems. A tool that is not installed
  # yields an empty path; the "required programs" check tests for that.
  CMD="$(command -v wdiff) -3" # Show only differences
  CURL=$(command -v curl)
  CURLOPT="--globoff"
  MD5=$(command -v md5sum)

  INTERACTIVE=true   # ask which two snapshots to compare
  PAUSE=false        # sleep between downloads
  DIFFONLY=false     # skip the download step
  ISBSD=false        # switched to true when `date --version` fails

  # Can't set "-e", diff returns exit code > 0
  set +e
  16.  
  # Print the help text to stderr.
  # Globals: VER, DIR, CMD (read) - interpolated into the message.
  # Outputs: 24 lines on stderr, nothing on stdout.
  usage () {
    # One printf argument per output line.
    printf '%s\n' \
      '' \
      "wiff, version $VER - Save and compare states of web pages." \
      '' \
      "Usage: $0 [options] file" \
      '' \
      '"file" must contain the list of urls to be compared. When the command runs' \
      'for the first time on an url list, the current state is saved. The following' \
      'runs compare the initial state to the current state.' \
      "The snapshots are stored in $DIR." \
      '' \
      'Options:' \
      '-c command' \
      "Diff command. Default: $CMD" \
      '-d directory' \
      "Set root directory. Default: $DIR" \
      '-n' \
      'Non-interactive mode. Compares the first snapshot to the latest.' \
      '-o' \
      "Don't download the URLs, just compare two states." \
      '-p' \
      'Pause (2 sec.) after each download.' \
      '-h' \
      'Show this message.' \
      '' >&2
  }
  45.  
  # Convert a Unix timestamp into a human-readable date in the local time zone.
  # Globals:   ISBSD (read) - "true" selects the BSD/macOS `date` syntax.
  # Arguments: $1 - seconds since the epoch.
  # Outputs:   the formatted date on stdout.
  ts2date () {
    if $ISBSD; then
      # -j: do not set the clock; -f "%s": parse $1 as epoch seconds.
      # The output format has to be introduced by "+".
      date -j -f "%s" "$1" "+%a %b %d %T %Z %Y"
    else
      # "@N" means N seconds since the epoch (UTC). The earlier form
      # "1970-01-01 N sec" was interpreted in the local time zone, so the
      # result was wrong by the zone's UTC offset.
      date -d "@$1"
    fi
  }
  53.  
  # Parse the command-line switches; each one overrides a configured default.
  while getopts "c:d:noph" OPTION; do
    case "$OPTION" in
      h | \?)
        # -h, or an option getopts rejected: show the help text and stop.
        usage
        exit 1
        ;;
      p) PAUSE=true ;;
      o) DIFFONLY=true ;;
      n) INTERACTIVE=false ;;
      d) DIR="$OPTARG" ;;
      c) CMD="$OPTARG" ;;
    esac
  done

  # Everything left after the switches is taken as the URL list file.
  shift "$((OPTIND - 1))"
  INPUTFILE="$*"
  75.  
  # System is BSD? The script treats a `date` that rejects --version as BSD.
  if ! date --version >/dev/null 2>&1; then
    ISBSD=true
  fi

  # Check required programs
  # md5 for Mac OS X (must run in quiet mode)
  if [ -z "$MD5" ]; then
    MD5="$(command -v md5) -q"
  fi
  # Each variable may hold options after the program path, so only the first
  # word is tested. Testing "$MD5" as a whole could never fail here: after
  # the fallback above it always contains at least " -q".
  if [ -z "${MD5%% *}" ] || [ -z "${CMD%% *}" ] || [ -z "${CURL%% *}" ]; then
    echo "Required programs not available" >&2
    exit 4
  fi

  if [ -z "$INPUTFILE" ]; then
    echo "No files specified" >&2
    usage
    exit 2
  fi

  # Create the snapshot root on first use; nothing can be stored without it.
  if [ ! -d "$DIR" ] && ! mkdir -p "$DIR"; then
    echo "Cannot create directory $DIR" >&2
    exit 5
  fi

  if [ ! -r "$INPUTFILE" ]; then
    echo "$INPUTFILE does not exist or is not readable" >&2
    exit 3
  fi
  # --- Snapshot bookkeeping and download -------------------------------------
  # Each URL list gets a directory named after the MD5 of the list's content.
  # In it, file "<n>" stores the epoch timestamp of snapshot n, and
  # "<md5 of URL>.<n>" stores the page downloaded for that URL in snapshot n.

  # Hash the list through stdin so that no file name is printed: md5sum then
  # emits "<hash>  -" and "md5 -q" only "<hash>"; tr drops the " -" remainder.
  # Stripping a printed file name afterwards failed for names containing
  # spaces or dashes, because tr had already removed those characters.
  # $MD5 is intentionally unquoted: it may be "md5 -q".
  FILEHASH=$($MD5 <"$INPUTFILE" | tr -d " -")
  HDIR="$DIR/$FILEHASH"
  if [ ! -d "$HDIR" ] && ! mkdir "$HDIR"; then
    echo "Cannot create directory $HDIR" >&2
    exit 5
  fi
  # Keep a copy of the URL list next to its snapshots. cp stores it under
  # its base name, so that is the name to look for.
  if [ ! -f "$HDIR/${INPUTFILE##*/}" ]; then
    cp "$INPUTFILE" "$HDIR"
  fi

  # Find the first unused snapshot number.
  VERSION=1
  while [ -f "$HDIR/$VERSION" ]; do
    VERSION=$((VERSION + 1))
  done

  if ! $DIFFONLY; then
    date +%s >"$HDIR/$VERSION"
    # Split the list on whitespace WITHOUT pathname expansion, so URLs that
    # contain ?, * or [ are never replaced by matching file names.
    # read returns non-zero at end of file; the array is filled regardless.
    URLS=()
    read -r -d '' -a URLS <"$INPUTFILE"
    for URL in "${URLS[@]}"; do
      # The here-string appends a newline to the hashed text; the comparison
      # step hashes URLs the same way, so the file names match.
      URLHASH=$($MD5 <<<"$URL" | tr -d " -")
      echo "Downloading $URL ..."
      # $CURL and $CURLOPT are intentionally unquoted (options are split).
      if ! $CURL $CURLOPT "$URL" >"$HDIR/$URLHASH.$VERSION"; then
        echo "Warning: download of $URL failed" >&2
      fi
      if $PAUSE; then
        sleep 2
      fi
    done
  else
    # Nothing new was saved: the latest snapshot is the previous number.
    VERSION=$((VERSION - 1))
  fi
  # --- Comparison -------------------------------------------------------------
  # Needs at least two snapshots. Version A defaults to the first snapshot,
  # version B to the latest one.
  if [ "$VERSION" -gt 1 ]; then
    # Start empty so that variables named A or B inherited from the
    # environment cannot influence a non-interactive run.
    A=""
    B=""
    if $INTERACTIVE; then
      printf '\nSaved versions:\n\n'
      for ((i = 1; i <= VERSION; i++)); do
        printf '%s - %s\n' "$i" "$(ts2date "$(cat "$HDIR/$i")")"
      done
      printf '\nEnter number of version A: [1] '
      read -r A
      printf 'Enter number of version B: [%s] ' "$VERSION"
      read -r B
    fi
    [ -z "$A" ] && A="1"
    [ -z "$B" ] && B="$VERSION"

    # Reject anything that is not the number of an existing snapshot instead
    # of failing later with a cascade of cat/date/diff errors.
    for N in "$A" "$B"; do
      case "$N" in
        *[!0-9]*)
          echo "Invalid version number: $N" >&2
          exit 1
          ;;
      esac
      if [ "$N" -lt 1 ] || [ "$N" -gt "$VERSION" ]; then
        echo "Version number out of range (1-$VERSION): $N" >&2
        exit 1
      fi
    done
    # Drop leading zeros so the values match the snapshot file names.
    A=$((10#$A))
    B=$((10#$B))

    # The two snapshot dates are the same for every URL; look them up once.
    DATE_A=$(ts2date "$(cat "$HDIR/$A")")
    DATE_B=$(ts2date "$(cat "$HDIR/$B")")

    # Split the list on whitespace WITHOUT pathname expansion, so URLs that
    # contain ?, * or [ are never replaced by matching file names.
    # read returns non-zero at end of file; the array is filled regardless.
    URLS=()
    read -r -d '' -a URLS <"$INPUTFILE"
    for URL in "${URLS[@]}"; do
      printf '\n%s\n - Version A: %s\n + Version B: %s\n' "$URL" "$DATE_A" "$DATE_B"
      # Same hashing as at download time ($MD5 may be "md5 -q": unquoted).
      URLHASH=$($MD5 <<<"$URL" | tr -d " -")
      # $CMD is intentionally unquoted: it carries the diff tool's options.
      $CMD "$HDIR/$URLHASH.$A" "$HDIR/$URLHASH.$B"
    done
  fi

  exit 0

Report this snippet  

You need to login to post a comment.