Revision: 6084
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at April 26, 2008 16:19 by mandric
Initial Code
#!/bin/sh
#
# scrape_yahoo.sh
#
# This script pulls data from yahoo using a NYSE index list generated
# from scrape_nyse.sh. It iterates through the list, saving data from each
# security in one big log and separate per-security log files. This stuff
# should go into a database, sooner or later.

# Debug switch consulted by the fetch functions and the main loop.
DEBUG=0
PATH="${PATH}:/usr/local/bin"

# Every data and log location below is built relative to this directory.
BASEDIR=/home/stockh/stockharmony.com/api/scripts/
SYMBOLS_FILE="${BASEDIR}../data/nyse_index_symbols.txt"
# Last line of the symbol list ("tail -n 1" is the POSIX spelling of "tail -1").
LAST_SYMBOL=$(tail -n 1 "$SYMBOLS_FILE")
# the SYMBOL string used in a request to yahoo
SYMBOLS=''
# took out "6t" (the URL)
ARG_STRING=$(tr -d '\n' < "${BASEDIR}../data/yahoo_arg_string_custom.txt")
YAHOO_URL='http://finance.yahoo.com/d/quotes.csv?s='
COUNT=0
# how many symbols should we query in one request?
GET_SYMBOLS=15
SLEEP_TIME=60
# Command substitution already strips the trailing newline from date.
BEGIN_TIME=$(date +%Y%m%d%H%M%S)
DATA_FILE="${BASEDIR}../data/yahoo-finance-${BEGIN_TIME}.csv"
TMP_FILE=/tmp/yahoo-data
LOGFILE="${BASEDIR}../logs/scraping"
SECS_DIR="${BASEDIR}../data/securities/"
GET_YAHOO=0
# Quoted so a script path containing whitespace still yields one name.
SELF=$(basename "$0")
PIDFILE="${BASEDIR}../logs/${SELF}.pid"
ERROR_STRING='default error string'
# define functions first, put in include file later
# Mail an alert with mailx (subject 'stockh error'), at most once per run.
# Arguments: $1 - optional message text; when omitted, the current value of
#                 the global STRING is sent instead.
# Globals:   STRING   (overwritten when $1 is given)
#            SMS_SENT (read; set to 1 after the first alert goes out)
# Returns:   0 immediately if an alert was already sent.
send_sms_msg () {
  if [ -n "$1" ]; then STRING=$1; fi
  if [ -n "$SMS_SENT" ]; then
    return 0
  fi
  # printf + quoting keeps the message intact; an unquoted echo would collapse
  # whitespace and expand glob characters in the text.
  # NOTE(review): the recipient below looks like a redaction placeholder, not
  # a real address - confirm and restore the intended recipient.
  printf '%s\n' "$STRING" | mailx -s 'stockh error' [email protected]
  SMS_SENT=1
}
# Append a timestamped "OOPS" entry to $LOGFILE.
# Arguments: $1 - optional explanation appended after "because".
# Globals:   LOGFILE (appended to), STRING (overwritten with the log line).
# The exit status reported in the entry is the status of the last command the
# caller ran before invoking this function.
log_error () {
  # Must be the very first statement: any later command (including the
  # argument test below) overwrites $?.
  _log_error_rc=$?
  if [ -n "$1" ]; then
    STRING="OOPS: return code $_log_error_rc because $1"
  else
    STRING="OOPS: return code $_log_error_rc"
  fi
  date >> "$LOGFILE"
  printf '%s\n' "$STRING" >> "$LOGFILE"
}
# Append a timestamped "OK" entry to $LOGFILE.
# Arguments: $1 - optional message; without it a generic line carrying the
#                 caller's last exit status is written.
# Globals:   LOGFILE (appended to), STRING (overwritten with the log line).
log_normal () {
  # Grab the caller's status before the argument test below replaces it.
  _log_normal_rc=$?
  if [ -n "$1" ]; then
    STRING="OK: $1"
  else
    STRING="OK: seems ok $_log_normal_rc"
  fi
  date >> "$LOGFILE"
  # Quoted printf: messages are often URLs containing '?', which an unquoted
  # echo would subject to glob expansion.
  printf '%s\n' "$STRING" >> "$LOGFILE"
}
# Persist the rows currently held in $TMP_FILE.
# Globals read: TMP_FILE, DATA_FILE, LOGFILE, SECS_DIR, SYMBOLS, GOT_WHEN.
# Effects:
#   - appends the whole of $TMP_FILE to $DATA_FILE;
#   - for every symbol in $SYMBOLS, appends each row containing ,"SYMBOL",
#     to ${SECS_DIR}SYMBOL.csv, prefixed with "$GOT_WHEN,".
save_symbol_data () {
  # save once in main file
  cat "$TMP_FILE" >> "$DATA_FILE" 2>> "$LOGFILE"
  # Deliberately unquoted: SYMBOLS is a space-separated list of tickers.
  for j in $SYMBOLS; do
    # -F matches the ticker literally, so "A.A" no longer matches "AAA".
    # The timestamp is emitted per matched row, so a symbol with no data
    # leaves no dangling "timestamp," fragment in its file.
    grep -iF -- ",\"$j\"," "$TMP_FILE" 2>> "$LOGFILE" |
      while IFS= read -r _row; do
        printf '%s,%s\n' "$GOT_WHEN" "$_row"
      done >> "${SECS_DIR}$j.csv"
  done
}
# Look for "Ticker symbol has changed to:" notices in TMP_FILE; if any are
# found, fetch quotes for the replacement symbols and append them to TMP_FILE.
# Globals read:    TMP_FILE, LOGFILE, YAHOO_URL, ARG_STRING, DEBUG
# Globals written: NEW (space-separated replacement symbols), SYMBOLS (the
#                  replacements are appended), TMP, URL
# Calls: log_normal, log_error, send_sms_msg, lynx
fetch_changed_symbol_data () {
  # Pull the ticker out of each notice's <a href="/q?s=TICKER"> link.
  # [^"]* stops at the closing quote of the href, and -n/p drops lines that
  # carry the notice text but no link instead of passing them on as a symbol.
  NEW=$(sed -n 's|.*"Ticker symbol has changed to: <a href="/q?s=\([^"]*\)">.*|\1|p' "$TMP_FILE" |
    tr '\n' ' ')
  NEW=${NEW% }
  if [ -n "$NEW" ]; then
    log_normal "got changed symbols: $NEW"
    # save new syms to global var $SYMBOLS
    SYMBOLS="${SYMBOLS} ${NEW}"
    TMP=$(printf '%s' "$NEW" | tr ' ' '+')
    URL="${YAHOO_URL}${TMP}&f=$ARG_STRING"
    # Append data to tmp file. Testing lynx directly keeps its exit status in
    # $? when the else branch starts.
    if lynx -dump "$URL" >> "$TMP_FILE" 2>> "$LOGFILE"; then
      # "[ $DEBUG ]" is true for any non-empty value, including DEBUG=0;
      # compare against 0 so that 0 (or unset) really disables debug output.
      if [ "${DEBUG:-0}" != 0 ]; then log_normal "$URL"; fi
    else
      log_error "lynx failed getting changed symbols $URL"
      send_sms_msg "lynx failed getting $NEW"
    fi
  fi
}
# Query yahoo for the space-separated symbols in $SYMBOLS, overwrite TMP_FILE
# with the response, then record the data.
# Globals read:    SYMBOLS, YAHOO_URL, ARG_STRING, TMP_FILE, LOGFILE, DEBUG
# Globals written: TMP ('+'-joined symbols), URL, GOT_WHEN (fetch timestamp)
# Calls: lynx, fetch_changed_symbol_data, save_symbol_data, log_normal,
#        log_error, send_sms_msg
fetch_and_save_symbol_data () {
  # replace space with + for URL
  TMP=$(printf '%s' "$SYMBOLS" | tr ' ' '+')
  URL="${YAHOO_URL}${TMP}&f=$ARG_STRING"
  # Clobber TMP_FILE with new data. Testing lynx directly keeps its exit
  # status in $? when the else branch starts.
  if lynx -dump "$URL" > "$TMP_FILE" 2>> "$LOGFILE"; then
    # "[ $DEBUG ]" is true for any non-empty value, including DEBUG=0;
    # compare against 0 so that 0 (or unset) really disables debug output.
    if [ "${DEBUG:-0}" != 0 ]; then log_normal "$URL"; fi
    GOT_WHEN=$(date +%Y%m%d%H%M%S)
    fetch_changed_symbol_data
    # regardless of fetch_changed_symbol_data always save symbol data at
    # this point
    save_symbol_data
  else
    log_error "lynx failed getting $URL"
    send_sms_msg "lynx failed on $TMP"
  fi
}
# Refuse to start while a PID file exists.
if [ -f "$PIDFILE" ]; then
  send_sms_msg "$SELF exiting, PID exists"
  log_error "$SELF exiting, PID exists"
  exit 1
fi
echo $$ > "$PIDFILE" 2>> "$LOGFILE"

# Fetch and store the symbols collected in $SYMBOLS, pause, then reset the
# batch state (SYMBOLS, COUNT, GET_YAHOO) for the next round.
flush_symbol_batch () {
  # "[ $DEBUG ]" is true for any non-empty value, including DEBUG=0;
  # compare against 0 so that 0 (or unset) really disables debug output.
  if [ "${DEBUG:-0}" != 0 ]; then log_normal "SYMBOLS are $SYMBOLS"; fi
  fetch_and_save_symbol_data
  sleep "$SLEEP_TIME"
  SYMBOLS=''
  COUNT=0
  GET_YAHOO=0
}

# Collect $GET_SYMBOLS symbols at a time in $SYMBOLS, then query yahoo.
# The command substitution is split on whitespace on purpose: every
# whitespace-separated token of the symbols file is one symbol.
for i in $(cat "$SYMBOLS_FILE"); do
  COUNT=$((COUNT + 1))
  SYMBOLS="${SYMBOLS:+$SYMBOLS }$i"
  if [ "$COUNT" -ge "$GET_SYMBOLS" ]; then
    flush_symbol_batch
  fi
done
# Flush the final, partially filled batch. Doing this after the loop avoids
# comparing each symbol with $LAST_SYMBOL, which errors out when the file ends
# in a blank line and flushes early when the last symbol occurs twice.
if [ -n "$SYMBOLS" ]; then
  flush_symbol_batch
fi

# Run summary: start/finish times and line counts of output and input.
{
  echo "$SELF started at $BEGIN_TIME"
  echo "$SELF finished on $(date)"
  wc -l "$DATA_FILE"
  wc -l "$SYMBOLS_FILE"
} >> "$LOGFILE" 2>&1

if rm -f "$PIDFILE" >> "$LOGFILE" 2>&1; then
  exit 0
else
  # Save rm's status right away; any later command would overwrite $?.
  rm_status=$?
  send_sms_msg "could not remove PIDFILE. rm returned $rm_status"
  exit 1
fi
Initial URL
Initial Description
Not sure why I'm posting this madness. Bash can be good for some things, though. I think part of the reason I used bash was that I was calling a lot of external commands.
Initial Title
Scrape Yahoo Finance
Initial Tags
Bash, Business
Initial Language
Bash