Return to Snippet

Revision: 6084
at April 26, 2008 16:19 by mandric


Initial Code
#!/bin/sh 
#
# scrape_yahoo.sh
#
# This script pulls data from yahoo using a NYSE index list generated 
# from scrape_nyse.sh.  It iterates through the list, saving data from each 
# security in one big log and separate per-security log files.  This stuff 
# should go into a database, sooner or later.

# --- general ------------------------------------------------------------
DEBUG=0
PATH="${PATH}:/usr/local/bin"
ERROR_STRING='default error string'

# --- filesystem layout (everything hangs off the scripts directory) ------
BASEDIR=/home/stockh/stockharmony.com/api/scripts/
SYMBOLS_FILE="${BASEDIR}../data/nyse_index_symbols.txt"
SECS_DIR="${BASEDIR}../data/securities/"
LOGFILE="${BASEDIR}../logs/scraping"
TMP_FILE=/tmp/yahoo-data
SELF=$(basename "$0")
PIDFILE="${BASEDIR}../logs/${SELF}.pid"

# --- yahoo request -------------------------------------------------------
YAHOO_URL='http://finance.yahoo.com/d/quotes.csv?s='
# field list for the "f=" parameter, read from a file with its newlines removed
# took out "6t" (the URL)
ARG_STRING=$(tr -d '\n' < "${BASEDIR}../data/yahoo_arg_string_custom.txt")

# --- batching state ------------------------------------------------------
# the SYMBOL string used in a request to yahoo
SYMBOLS=''
COUNT=0
GET_YAHOO=0
# how many symbols should we query in one request?
GET_SYMBOLS=15
SLEEP_TIME=60
# last line of the symbols file
LAST_SYMBOL=$(tail -n 1 "$SYMBOLS_FILE")

# --- per-run output ------------------------------------------------------
# start time as YYYYmmddHHMMSS; also used to name this run's data file
BEGIN_TIME=$(date +%Y%m%d%H%M%S)
DATA_FILE="${BASEDIR}../data/yahoo-finance-${BEGIN_TIME}.csv"

# define functions first, put in include file later

# Mail an alert through mailx, at most once per run of the script.
#   $1 - optional message text; when omitted, the current value of the
#        global STRING is sent instead.
# Globals: STRING (read, written when $1 is given), SMS_SENT (read, set to 1
#          after the first send attempt).
# Returns 0 whether the message was sent now or suppressed as a repeat.
send_sms_msg () {
  if [ -n "$1" ]; then STRING=$1; fi
  if [ -n "$SMS_SENT" ]; then
    return 0
  fi
  # The message is quoted so that glob characters and runs of blanks in it
  # reach mailx unchanged instead of being expanded by the shell.
  # NOTE(review): the recipient below looks like a web e-mail-obfuscation
  # placeholder rather than a real address; as written the shell hands mailx
  # two separate words. Restore the real address before relying on alerts.
  printf '%s\n' "$STRING" | mailx -s 'stockh error' [email protected]
  SMS_SENT=1
}

# Append a timestamp line followed by an "OOPS" line to $LOGFILE.
#   $1 - optional reason, appended to the message after "because".
# The return code reported is the exit status of the command that ran
# immediately before log_error was called. It is saved on the first line
# because every later command (including the "[" test) overwrites $?.
# Globals: LOGFILE (read), STRING (written), LAST_RC (written).
log_error () {
  LAST_RC=$?
  if [ -n "$1" ]; then
     STRING="OOPS: return code $LAST_RC because $1"
  else
     STRING="OOPS: return code $LAST_RC"
  fi
  {
    date
    # printf with a quoted argument keeps the message literal (no globbing)
    printf '%s\n' "$STRING"
  } >> "$LOGFILE"
}

# Append a timestamp line followed by an "OK" line to $LOGFILE.
#   $1 - optional message; without it a generic line is written that
#        includes the exit status of the command that ran immediately
#        before log_normal was called.
# That status is saved on the first line because the "[" test below
# would otherwise overwrite $?.
# Globals: LOGFILE (read), STRING (written), LAST_RC (written).
log_normal () {
  LAST_RC=$?
  if [ -n "$1" ]; then
     STRING="OK: $1"
  else
     STRING="OK: seems ok $LAST_RC"
  fi
  {
    date
    # printf with a quoted argument keeps the message literal (no globbing)
    printf '%s\n' "$STRING"
  } >> "$LOGFILE"
}

# Persist the batch currently held in $TMP_FILE.
# The whole file is appended to $DATA_FILE. Then, for every symbol in the
# space separated list $SYMBOLS, each line containing ,"SYMBOL", is appended
# to ${SECS_DIR}SYMBOL.csv with "$GOT_WHEN," put in front of it.
# Globals read: TMP_FILE, DATA_FILE, LOGFILE, SECS_DIR, SYMBOLS, GOT_WHEN.
save_symbol_data () {
    # save once in main file
    cat "$TMP_FILE" >> "$DATA_FILE" 2>> "$LOGFILE"
    # grep for each symbol and save in separate files
    # ($SYMBOLS is left unquoted on purpose: it is split into one word per symbol)
    for j in $SYMBOLS; do
      match=",\"$j\","
      # -F: the symbol is matched literally, so a "." in a ticker is not a
      # regex wildcard. The timestamp is added per matching line, so a symbol
      # with no match writes nothing rather than a dangling "timestamp,".
      grep -i -F -e "$match" "$TMP_FILE" 2>> "$LOGFILE" |
        awk -v ts="$GOT_WHEN" '{ print ts "," $0 }' >> "${SECS_DIR}$j.csv"
    done
}

# Look if any symbols have changed in TMP_FILE, get that data too, append to TMP_FILE.
# Lines containing '"Ticker symbol has changed to:' are scanned for the
# replacement symbol inside the <a href="/q?s=..."> link. Any symbols found
# are added to the global $SYMBOLS and requested from yahoo in one extra call
# whose output is appended to $TMP_FILE.
# Globals read:    TMP_FILE, LOGFILE, YAHOO_URL, ARG_STRING, DEBUG
# Globals written: SYMBOLS, NEW, TMP, URL
fetch_changed_symbol_data () {

  TMP=$(grep '"Ticker symbol has changed to:' "$TMP_FILE" |
    sed 's/.*changed to: <a href="\/q?s=\([^"]*\)">.*/\1/')
  # fold the one-per-line result into a single-space separated, trimmed list
  NEW=$(printf '%s\n' "$TMP" | tr -s '[:space:]' ' ' | sed 's/^ //; s/ $//')
  if [ -z "$NEW" ]; then
      return 0
  fi

  log_normal "got changed symbols: $NEW"
  # save new syms to global var $SYMBOLS
  SYMBOLS="${SYMBOLS} ${NEW}"
  TMP=$(printf '%s' "$NEW" | tr ' ' '+')
  URL="${YAHOO_URL}${TMP}&f=$ARG_STRING"
  # append data to tmp file; the URL is quoted because it contains "?" and "&"
  if lynx -dump "$URL" >> "$TMP_FILE" 2>> "$LOGFILE"; then
    # only log the URL when DEBUG is set to something other than 0
    if [ "${DEBUG:-0}" != 0 ]; then log_normal "$URL" ; fi
  else
    # log_error runs first in this branch so that $? still holds lynx's
    # exit status at the moment it is called
    log_error "lynx failed getting changed symbols $URL"
    send_sms_msg "lynx failed getting $NEW"
  fi

}

# take space separated list of symbols, query yahoo and save to TMP_FILE
# On success the fetch time is recorded in $GOT_WHEN, any changed symbols are
# fetched as well, and the batch is saved. On failure the error is logged and
# an alert is sent; nothing is saved.
# Globals read:    SYMBOLS, YAHOO_URL, ARG_STRING, TMP_FILE, LOGFILE, DEBUG
# Globals written: TMP, URL, GOT_WHEN
fetch_and_save_symbol_data () {

  # replace space with + for URL
  TMP=$(printf '%s' "$SYMBOLS" | tr ' ' '+')
  URL="${YAHOO_URL}${TMP}&f=$ARG_STRING"
  # clobber TMP_FILE with new data; the URL is quoted because it contains
  # "?" and "&"
  if lynx -dump "$URL" > "$TMP_FILE" 2>> "$LOGFILE"; then
    # only log the URL when DEBUG is set to something other than 0
    if [ "${DEBUG:-0}" != 0 ]; then log_normal "$URL" ; fi
    GOT_WHEN=$(date +%Y%m%d%H%M%S)
    fetch_changed_symbol_data
    # regardless of fetch_changed_symbol_data always save symbol data at
    # this point
    save_symbol_data
  else
    # log_error runs first in this branch so that $? still holds lynx's
    # exit status at the moment it is called
    log_error "lynx failed getting $URL"
    send_sms_msg "lynx failed on $TMP"
  fi
}

# Refuse to start while a pid file from another run is present.
if [ -f "$PIDFILE" ]; then
  send_sms_msg "$SELF exiting, PID exists"
  log_error "$SELF exiting, PID exists"
  exit 1
fi

echo $$ > "$PIDFILE" 2>> "$LOGFILE"

# Send the symbols collected in $SYMBOLS to yahoo, pause, then reset the
# batch state ($SYMBOLS and $COUNT) for the next round.
fetch_pending_batch () {
  # only log the batch when DEBUG is set to something other than 0
  if [ "${DEBUG:-0}" != 0 ]; then log_normal "SYMBOLS are $SYMBOLS"; fi
  fetch_and_save_symbol_data
  sleep "$SLEEP_TIME"
  SYMBOLS=''
  COUNT=0
}

# collect $GET_SYMBOLS symbols at a time in $SYMBOLS, then fetch that batch
# (the command substitution is left unquoted on purpose: the file content is
# split into one word per symbol)
for i in $(cat "$SYMBOLS_FILE"); do

  COUNT=$((COUNT + 1))
  SYMBOLS="${SYMBOLS:+$SYMBOLS }$i"

  if [ "$COUNT" -ge "$GET_SYMBOLS" ]; then
    fetch_pending_batch
  fi

done

# Whatever is left over after the loop is the final, partial batch. Flushing
# it here does not depend on recognising the last symbol of the file, so a
# trailing blank line in the symbols file cannot cause the batch to be lost.
if [ -n "$SYMBOLS" ]; then
  fetch_pending_batch
fi

# run summary
{
  echo "$SELF started at $BEGIN_TIME"
  echo "$SELF finished on $(date)"
  wc -l "$DATA_FILE"
  wc -l "$SYMBOLS_FILE"
} >> "$LOGFILE" 2>&1

if rm -f "$PIDFILE" >> "$LOGFILE" 2>&1; then
  exit 0
else
  # capture rm's status immediately; any later command would overwrite $?
  RM_RC=$?
  send_sms_msg "could not remove PIDFILE. rm returned $RM_RC"
  exit 1
fi

Initial URL


Initial Description
Not sure why I'm posting this madness.  Bash can be good for some things though.  I think part of the reason I used bash was that I was calling a lot of external commands.

Initial Title
Scrape Yahoo Finance

Initial Tags
Bash, Business

Initial Language
Bash