/ Published in: C++
Expand |
Embed | Plain Text
Copy this code and paste it in your HTML
//============================================================================= // // SVD Sample Code // // Copyright (C) 2007 Timely Development (www.timelydevelopment.com) // // Special thanks to Simon Funk and others from the Netflix Prize contest // for providing pseudo-code and tuning hints. // // Feel free to use this code as you wish as long as you include // these notices and attribution. // // Also, if you have alternative types of algorithms for accomplishing // the same goal and would like to contribute, please share them as well :) // // STANDARD DISCLAIMER: // // - THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY // - OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT // - LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR // - FITNESS FOR A PARTICULAR PURPOSE. // //============================================================================= #define WIN32_LEAN_AND_MEAN #include <windows.h> #include <stdio.h> #include <math.h> #include <tchar.h> #include <map> using namespace std; //=================================================================== // // Constants and Type Declarations // //=================================================================== #define TRAINING_PATH L"C: etflix raining_set*.txt" #define TRAINING_FILE L"C: etflix raining_set\%s" #define FEATURE_FILE L"C: etflixfeatures.txt" #define TEST_PATH L"C: etflix\%s" #define PREDICTION_FILE L"C: etflixprediction.txt" #define MAX_RATINGS 100480508 // Ratings in entire training set (+1) #define MAX_CUSTOMERS 480190 // Customers in the entire training set (+1) #define MAX_MOVIES 17771 // Movies in the entire training set (+1) #define MAX_FEATURES 64 // Number of features to use #define MIN_EPOCHS 120 // Minimum number of epochs per feature #define MAX_EPOCHS 200 // Max epochs per feature #define MIN_IMPROVEMENT 0.0001 // Minimum improvement required to continue current feature #define INIT 0.1 // Initialization value for features #define LRATE 0.001 // Learning rate parameter #define K 0.015 // Regularization parameter used to minimize over-fitting typedef unsigned char BYTE; typedef map<int, int> IdMap; typedef IdMap::iterator IdItr; struct Movie { int RatingCount; int RatingSum; double RatingAvg; double PseudoAvg; // Weighted average used to deal with small movie counts }; struct Customer { int CustomerId; int RatingCount; int RatingSum; }; struct Data { int CustId; short MovieId; BYTE Rating; float Cache; }; class Engine { private: int m_nRatingCount; // Current number of loaded ratings Data m_aRatings[MAX_RATINGS]; // Array of ratings data Movie m_aMovies[MAX_MOVIES]; // Array of movie metrics Customer m_aCustomers[MAX_CUSTOMERS]; // Array of customer metrics float m_aMovieFeatures[MAX_FEATURES][MAX_MOVIES]; // Array of features by movie (using floats to save space) float m_aCustFeatures[MAX_FEATURES][MAX_CUSTOMERS]; // Array of features by customer (using floats to save space) IdMap m_mCustIds; // Map for one time translation of ids to compact array index inline double PredictRating(short movieId, int custId, int feature, float cache, bool bTrailing=true); inline double PredictRating(short movieId, int custId); bool ReadNumber(wchar_t* pwzBufferIn, int nLength, int &nPosition, wchar_t* pwzBufferOut); bool ParseInt(wchar_t* pwzBuffer, int nLength, int &nPosition, int& nValue); bool ParseFloat(wchar_t* pwzBuffer, int nLength, int &nPosition, float& fValue); public: Engine(void); ~Engine(void) { }; void CalcMetrics(); void CalcFeatures(); void LoadHistory(); void ProcessTest(wchar_t* pwzFile); void ProcessFile(wchar_t* pwzFile); }; //=================================================================== // // Program Main // //=================================================================== int _tmain(int argc, _TCHAR* argv[]) { Engine* engine = new Engine(); engine->LoadHistory(); engine->CalcMetrics(); engine->CalcFeatures(); engine->ProcessTest(L"qualifying.txt"); wprintf(L" Done "); getchar(); return 0; } //=================================================================== // // Engine Class // //=================================================================== //------------------------------------------------------------------- // Initialization //------------------------------------------------------------------- Engine::Engine(void) { m_nRatingCount = 0; for (int f=0; f<MAX_FEATURES; f++) { for (int i=0; i<MAX_MOVIES; i++) m_aMovieFeatures[f][i] = (float)INIT; for (int i=0; i<MAX_CUSTOMERS; i++) m_aCustFeatures[f][i] = (float)INIT; } } //------------------------------------------------------------------- // Calculations - This Paragraph contains all of the relevant code //------------------------------------------------------------------- // // CalcMetrics // - Loop through the history and pre-calculate metrics used in the training // - Also re-number the customer id's to fit in a fixed array // void Engine::CalcMetrics() { int i, cid; IdItr itr; wprintf(L" Calculating intermediate metrics "); // Process each row in the training set for (i=0; i<m_nRatingCount; i++) { Data* rating = m_aRatings + i; // Increment movie stats m_aMovies[rating->MovieId].RatingCount++; m_aMovies[rating->MovieId].RatingSum += rating->Rating; // Add customers (using a map to re-number id's to array indexes) itr = m_mCustIds.find(rating->CustId); if (itr == m_mCustIds.end()) { cid = 1 + (int)m_mCustIds.size(); // Reserve new id and add lookup m_mCustIds[rating->CustId] = cid; // Store off old sparse id for later m_aCustomers[cid].CustomerId = rating->CustId; // Init vars to zero m_aCustomers[cid].RatingCount = 0; m_aCustomers[cid].RatingSum = 0; } else { cid = itr->second; } // Swap sparse id for compact one rating->CustId = cid; m_aCustomers[cid].RatingCount++; m_aCustomers[cid].RatingSum += rating->Rating; } // Do a follow-up loop to calc movie averages for (i=0; i<MAX_MOVIES; i++) { Movie* movie = m_aMovies+i; movie->RatingAvg = movie->RatingSum / (1.0 * movie->RatingCount); movie->PseudoAvg = (3.23 * 25 + movie->RatingSum) / (25.0 + movie->RatingCount); } } // // CalcFeatures // - Iteratively train each feature on the entire data set // - Once sufficient progress has been made, move on // void Engine::CalcFeatures() { int f, e, i, custId, cnt = 0; Data* rating; double err, p, sq, rmse_last, rmse = 2.0; short movieId; float cf, mf; for (f=0; f<MAX_FEATURES; f++) { wprintf(L" --- Calculating feature: %d --- ", f); // Keep looping until you have passed a minimum number // of epochs or have stopped making significant progress for (e=0; (e < MIN_EPOCHS) || (rmse <= rmse_last - MIN_IMPROVEMENT); e++) { cnt++; sq = 0; rmse_last = rmse; for (i=0; i<m_nRatingCount; i++) { rating = m_aRatings + i; movieId = rating->MovieId; custId = rating->CustId; // Predict rating and calc error p = PredictRating(movieId, custId, f, rating->Cache, true); err = (1.0 * rating->Rating - p); sq += err*err; // Cache off old feature values cf = m_aCustFeatures[f][custId]; mf = m_aMovieFeatures[f][movieId]; // Cross-train the features m_aCustFeatures[f][custId] += (float)(LRATE * (err * mf - K * cf)); m_aMovieFeatures[f][movieId] += (float)(LRATE * (err * cf - K * mf)); } rmse = sqrt(sq/m_nRatingCount); wprintf(L" <set x='%d' y='%f' /> ",cnt,rmse); } // Cache off old predictions for (i=0; i<m_nRatingCount; i++) { rating = m_aRatings + i; rating->Cache = (float)PredictRating(rating->MovieId, rating->CustId, f, rating->Cache, false); } } } // // PredictRating // - During training there is no need to loop through all of the features // - Use a cache for the leading features and do a quick calculation for the trailing // - The trailing can be optionally removed when calculating a new cache value // double Engine::PredictRating(short movieId, int custId, int feature, float cache, bool bTrailing) { // Get cached value for old features or default to an average double sum = (cache > 0) ? cache : 1; //m_aMovies[movieId].PseudoAvg; // Add contribution of current feature sum += m_aMovieFeatures[feature][movieId] * m_aCustFeatures[feature][custId]; if (sum > 5) sum = 5; if (sum < 1) sum = 1; // Add up trailing defaults values if (bTrailing) { sum += (MAX_FEATURES-feature-1) * (INIT * INIT); if (sum > 5) sum = 5; if (sum < 1) sum = 1; } return sum; } // // PredictRating // - This version is used for calculating the final results // - It loops through the entire list of finished features // double Engine::PredictRating(short movieId, int custId) { double sum = 1; //m_aMovies[movieId].PseudoAvg; for (int f=0; f<MAX_FEATURES; f++) { sum += m_aMovieFeatures[f][movieId] * m_aCustFeatures[f][custId]; if (sum > 5) sum = 5; if (sum < 1) sum = 1; } return sum; } //------------------------------------------------------------------- // Data Loading / Saving //------------------------------------------------------------------- // // LoadHistory // - Loop through all of the files in the training directory // void Engine::LoadHistory() { WIN32_FIND_DATA FindFileData; HANDLE hFind; bool bContinue = true; int count = 0; // TEST // Loop through all of the files in the training directory hFind = FindFirstFile(TRAINING_PATH, &FindFileData); if (hFind == INVALID_HANDLE_VALUE) return; while (bContinue) { this->ProcessFile(FindFileData.cFileName); bContinue = (FindNextFile(hFind, &FindFileData) != 0); //if (++count > 999) break; // TEST: Uncomment to only test with the first X movies } FindClose(hFind); } // // ProcessFile // - Load a history file in the format: // // <MovieId>: // <CustomerId>,<Rating> // <CustomerId>,<Rating> // ... void Engine::ProcessFile(wchar_t* pwzFile) { FILE *stream; wchar_t pwzBuffer[1000]; wsprintf(pwzBuffer,TRAINING_FILE,pwzFile); int custId, movieId, rating, pos = 0; wprintf(L"Processing file: %s ", pwzBuffer); if (_wfopen_s(&stream, pwzBuffer, L"r") != 0) return; // First line is the movie id fgetws(pwzBuffer, 1000, stream); ParseInt(pwzBuffer, (int)wcslen(pwzBuffer), pos, movieId); m_aMovies[movieId].RatingCount = 0; m_aMovies[movieId].RatingSum = 0; // Get all remaining rows fgetws(pwzBuffer, 1000, stream); while ( !feof( stream ) ) { pos = 0; ParseInt(pwzBuffer, (int)wcslen(pwzBuffer), pos, custId); ParseInt(pwzBuffer, (int)wcslen(pwzBuffer), pos, rating); m_aRatings[m_nRatingCount].MovieId = (short)movieId; m_aRatings[m_nRatingCount].CustId = custId; m_aRatings[m_nRatingCount].Rating = (BYTE)rating; m_aRatings[m_nRatingCount].Cache = 0; m_nRatingCount++; fgetws(pwzBuffer, 1000, stream); } // Cleanup fclose( stream ); } // // ProcessTest // - Load a sample set in the following format // // <Movie1Id>: // <CustomerId> // <CustomerId> // ... // <Movie2Id>: // <CustomerId> // // - And write results: // // <Movie1Id>: // <Rating> // <Raing> // ... void Engine::ProcessTest(wchar_t* pwzFile) { FILE *streamIn, *streamOut; wchar_t pwzBuffer[1000]; int custId, movieId, pos = 0; double rating; bool bMovieRow; wsprintf(pwzBuffer, TEST_PATH, pwzFile); wprintf(L" Processing test: %s ", pwzBuffer); if (_wfopen_s(&streamIn, pwzBuffer, L"r") != 0) return; if (_wfopen_s(&streamOut, PREDICTION_FILE, L"w") != 0) return; fgetws(pwzBuffer, 1000, streamIn); while ( !feof( streamIn ) ) { bMovieRow = false; for (int i=0; i<(int)wcslen(pwzBuffer); i++) { bMovieRow |= (pwzBuffer[i] == 58); } pos = 0; if (bMovieRow) { ParseInt(pwzBuffer, (int)wcslen(pwzBuffer), pos, movieId); // Write same row to results fputws(pwzBuffer,streamOut); } else { ParseInt(pwzBuffer, (int)wcslen(pwzBuffer), pos, custId); custId = m_mCustIds[custId]; rating = PredictRating(movieId, custId); // Write predicted value swprintf(pwzBuffer,1000,L"%5.3f ",rating); fputws(pwzBuffer,streamOut); } //wprintf(L"Got Line: %d %d %d ", movieId, custId, rating); fgetws(pwzBuffer, 1000, streamIn); } // Cleanup fclose( streamIn ); fclose( streamOut ); } //------------------------------------------------------------------- // Helper Functions //------------------------------------------------------------------- bool Engine::ReadNumber(wchar_t* pwzBufferIn, int nLength, int &nPosition, wchar_t* pwzBufferOut) { int count = 0; int start = nPosition; wchar_t wc = 0; // Find start of number while (start < nLength) { wc = pwzBufferIn[start]; if ((wc >= 48 && wc <= 57) || (wc == 45)) break; start++; } // Copy each character into the output buffer nPosition = start; while (nPosition < nLength && ((wc >= 48 && wc <= 57) || wc == 69 || wc == 101 || wc == 45 || wc == 46)) { pwzBufferOut[count++] = wc; wc = pwzBufferIn[++nPosition]; } // Null terminate and return pwzBufferOut[count] = 0; return (count > 0); } bool Engine::ParseFloat(wchar_t* pwzBuffer, int nLength, int &nPosition, float& fValue) { wchar_t pwzNumber[20]; bool bResult = ReadNumber(pwzBuffer, nLength, nPosition, pwzNumber); fValue = (bResult) ? (float)_wtof(pwzNumber) : 0; return false; } bool Engine::ParseInt(wchar_t* pwzBuffer, int nLength, int &nPosition, int& nValue) { wchar_t pwzNumber[20]; bool bResult = ReadNumber(pwzBuffer, nLength, nPosition, pwzNumber); nValue = (bResult) ? _wtoi(pwzNumber) : 0; return bResult; }
URL: http://www.timelydevelopment.com/demos/NetflixPrize.aspx