SVD Algorithm Used For One Implementation of the Netflix Prize (not the winning one)


/ Published in: C++
Save to your folder(s)



Copy this code and paste it in your HTML
  1. //=============================================================================
  2. //
  3. // SVD Sample Code
  4. //
  5. // Copyright (C) 2007 Timely Development (www.timelydevelopment.com)
  6. //
  7. // Special thanks to Simon Funk and others from the Netflix Prize contest
  8. // for providing pseudo-code and tuning hints.
  9. //
  10. // Feel free to use this code as you wish as long as you include
  11. // these notices and attribution.
  12. //
  13. // Also, if you have alternative types of algorithms for accomplishing
  14. // the same goal and would like to contribute, please share them as well :)
  15. //
  16. // STANDARD DISCLAIMER:
  17. //
  18. // - THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY
  19. // - OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT
  20. // - LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR
  21. // - FITNESS FOR A PARTICULAR PURPOSE.
  22. //
  23. //=============================================================================
  24.  
  25. #define WIN32_LEAN_AND_MEAN
  26. #include <windows.h>
  27. #include <stdio.h>
  28. #include <math.h>
  29. #include <tchar.h>
  30. #include <map>
  31. using namespace std;
  32.  
  33. //===================================================================
  34. //
  35. // Constants and Type Declarations
  36. //
  37. //===================================================================
  38. #define TRAINING_PATH L"C:
  39. etflix raining_set*.txt"
  40. #define TRAINING_FILE L"C:
  41. etflix raining_set\%s"
  42. #define FEATURE_FILE L"C:
  43. etflixfeatures.txt"
  44. #define TEST_PATH L"C:
  45. etflix\%s"
  46. #define PREDICTION_FILE L"C:
  47. etflixprediction.txt"
  48.  
  49. #define MAX_RATINGS 100480508 // Ratings in entire training set (+1)
  50. #define MAX_CUSTOMERS 480190 // Customers in the entire training set (+1)
  51. #define MAX_MOVIES 17771 // Movies in the entire training set (+1)
  52. #define MAX_FEATURES 64 // Number of features to use
  53. #define MIN_EPOCHS 120 // Minimum number of epochs per feature
  54. #define MAX_EPOCHS 200 // Max epochs per feature
  55.  
  56. #define MIN_IMPROVEMENT 0.0001 // Minimum improvement required to continue current feature
  57. #define INIT 0.1 // Initialization value for features
  58. #define LRATE 0.001 // Learning rate parameter
  59. #define K 0.015 // Regularization parameter used to minimize over-fitting
  60.  
  61. typedef unsigned char BYTE;
  62. typedef map<int, int> IdMap;
  63. typedef IdMap::iterator IdItr;
  64.  
  65. struct Movie
  66. {
  67. int RatingCount;
  68. int RatingSum;
  69. double RatingAvg;
  70. double PseudoAvg; // Weighted average used to deal with small movie counts
  71. };
  72.  
  73. struct Customer
  74. {
  75. int CustomerId;
  76. int RatingCount;
  77. int RatingSum;
  78. };
  79.  
  80. struct Data
  81. {
  82. int CustId;
  83. short MovieId;
  84. BYTE Rating;
  85. float Cache;
  86. };
  87.  
  88. class Engine
  89. {
  90. private:
  91. int m_nRatingCount; // Current number of loaded ratings
  92. Data m_aRatings[MAX_RATINGS]; // Array of ratings data
  93. Movie m_aMovies[MAX_MOVIES]; // Array of movie metrics
  94. Customer m_aCustomers[MAX_CUSTOMERS]; // Array of customer metrics
  95. float m_aMovieFeatures[MAX_FEATURES][MAX_MOVIES]; // Array of features by movie (using floats to save space)
  96. float m_aCustFeatures[MAX_FEATURES][MAX_CUSTOMERS]; // Array of features by customer (using floats to save space)
  97. IdMap m_mCustIds; // Map for one time translation of ids to compact array index
  98.  
  99. inline double PredictRating(short movieId, int custId, int feature, float cache, bool bTrailing=true);
  100. inline double PredictRating(short movieId, int custId);
  101.  
  102. bool ReadNumber(wchar_t* pwzBufferIn, int nLength, int &nPosition, wchar_t* pwzBufferOut);
  103. bool ParseInt(wchar_t* pwzBuffer, int nLength, int &nPosition, int& nValue);
  104. bool ParseFloat(wchar_t* pwzBuffer, int nLength, int &nPosition, float& fValue);
  105.  
  106. public:
  107. Engine(void);
  108. ~Engine(void) { };
  109.  
  110. void CalcMetrics();
  111. void CalcFeatures();
  112. void LoadHistory();
  113. void ProcessTest(wchar_t* pwzFile);
  114. void ProcessFile(wchar_t* pwzFile);
  115. };
  116.  
  117.  
  118. //===================================================================
  119. //
  120. // Program Main
  121. //
  122. //===================================================================
  123. int _tmain(int argc, _TCHAR* argv[])
  124. {
  125. Engine* engine = new Engine();
  126.  
  127. engine->LoadHistory();
  128. engine->CalcMetrics();
  129. engine->CalcFeatures();
  130. engine->ProcessTest(L"qualifying.txt");
  131.  
  132. wprintf(L"
  133. Done
  134. ");
  135. getchar();
  136.  
  137. return 0;
  138. }
  139.  
  140.  
  141. //===================================================================
  142. //
  143. // Engine Class
  144. //
  145. //===================================================================
  146.  
  147. //-------------------------------------------------------------------
  148. // Initialization
  149. //-------------------------------------------------------------------
  150.  
  151. Engine::Engine(void)
  152. {
  153. m_nRatingCount = 0;
  154.  
  155. for (int f=0; f<MAX_FEATURES; f++)
  156. {
  157. for (int i=0; i<MAX_MOVIES; i++) m_aMovieFeatures[f][i] = (float)INIT;
  158. for (int i=0; i<MAX_CUSTOMERS; i++) m_aCustFeatures[f][i] = (float)INIT;
  159. }
  160. }
  161.  
  162. //-------------------------------------------------------------------
  163. // Calculations - This Paragraph contains all of the relevant code
  164. //-------------------------------------------------------------------
  165.  
  166. //
  167. // CalcMetrics
  168. // - Loop through the history and pre-calculate metrics used in the training
  169. // - Also re-number the customer id's to fit in a fixed array
  170. //
  171. void Engine::CalcMetrics()
  172. {
  173. int i, cid;
  174. IdItr itr;
  175.  
  176. wprintf(L"
  177. Calculating intermediate metrics
  178.  
  179. ");
  180.  
  181. // Process each row in the training set
  182. for (i=0; i<m_nRatingCount; i++)
  183. {
  184. Data* rating = m_aRatings + i;
  185.  
  186. // Increment movie stats
  187. m_aMovies[rating->MovieId].RatingCount++;
  188. m_aMovies[rating->MovieId].RatingSum += rating->Rating;
  189.  
  190. // Add customers (using a map to re-number id's to array indexes)
  191. itr = m_mCustIds.find(rating->CustId);
  192. if (itr == m_mCustIds.end())
  193. {
  194. cid = 1 + (int)m_mCustIds.size();
  195.  
  196. // Reserve new id and add lookup
  197. m_mCustIds[rating->CustId] = cid;
  198.  
  199. // Store off old sparse id for later
  200. m_aCustomers[cid].CustomerId = rating->CustId;
  201.  
  202. // Init vars to zero
  203. m_aCustomers[cid].RatingCount = 0;
  204. m_aCustomers[cid].RatingSum = 0;
  205. }
  206. else
  207. {
  208. cid = itr->second;
  209. }
  210.  
  211. // Swap sparse id for compact one
  212. rating->CustId = cid;
  213.  
  214. m_aCustomers[cid].RatingCount++;
  215. m_aCustomers[cid].RatingSum += rating->Rating;
  216. }
  217.  
  218. // Do a follow-up loop to calc movie averages
  219. for (i=0; i<MAX_MOVIES; i++)
  220. {
  221. Movie* movie = m_aMovies+i;
  222. movie->RatingAvg = movie->RatingSum / (1.0 * movie->RatingCount);
  223. movie->PseudoAvg = (3.23 * 25 + movie->RatingSum) / (25.0 + movie->RatingCount);
  224. }
  225. }
  226.  
  227. //
  228. // CalcFeatures
  229. // - Iteratively train each feature on the entire data set
  230. // - Once sufficient progress has been made, move on
  231. //
  232. void Engine::CalcFeatures()
  233. {
  234. int f, e, i, custId, cnt = 0;
  235. Data* rating;
  236. double err, p, sq, rmse_last, rmse = 2.0;
  237. short movieId;
  238. float cf, mf;
  239.  
  240. for (f=0; f<MAX_FEATURES; f++)
  241. {
  242. wprintf(L"
  243. --- Calculating feature: %d ---
  244. ", f);
  245.  
  246. // Keep looping until you have passed a minimum number
  247. // of epochs or have stopped making significant progress
  248. for (e=0; (e < MIN_EPOCHS) || (rmse <= rmse_last - MIN_IMPROVEMENT); e++)
  249. {
  250. cnt++;
  251. sq = 0;
  252. rmse_last = rmse;
  253.  
  254. for (i=0; i<m_nRatingCount; i++)
  255. {
  256. rating = m_aRatings + i;
  257. movieId = rating->MovieId;
  258. custId = rating->CustId;
  259.  
  260. // Predict rating and calc error
  261. p = PredictRating(movieId, custId, f, rating->Cache, true);
  262. err = (1.0 * rating->Rating - p);
  263. sq += err*err;
  264.  
  265. // Cache off old feature values
  266. cf = m_aCustFeatures[f][custId];
  267. mf = m_aMovieFeatures[f][movieId];
  268.  
  269. // Cross-train the features
  270. m_aCustFeatures[f][custId] += (float)(LRATE * (err * mf - K * cf));
  271. m_aMovieFeatures[f][movieId] += (float)(LRATE * (err * cf - K * mf));
  272. }
  273.  
  274. rmse = sqrt(sq/m_nRatingCount);
  275.  
  276. wprintf(L" <set x='%d' y='%f' />
  277. ",cnt,rmse);
  278. }
  279.  
  280. // Cache off old predictions
  281. for (i=0; i<m_nRatingCount; i++)
  282. {
  283. rating = m_aRatings + i;
  284. rating->Cache = (float)PredictRating(rating->MovieId, rating->CustId, f, rating->Cache, false);
  285. }
  286. }
  287. }
  288.  
  289. //
  290. // PredictRating
  291. // - During training there is no need to loop through all of the features
  292. // - Use a cache for the leading features and do a quick calculation for the trailing
  293. // - The trailing can be optionally removed when calculating a new cache value
  294. //
  295. double Engine::PredictRating(short movieId, int custId, int feature, float cache, bool bTrailing)
  296. {
  297. // Get cached value for old features or default to an average
  298. double sum = (cache > 0) ? cache : 1; //m_aMovies[movieId].PseudoAvg;
  299.  
  300. // Add contribution of current feature
  301. sum += m_aMovieFeatures[feature][movieId] * m_aCustFeatures[feature][custId];
  302. if (sum > 5) sum = 5;
  303. if (sum < 1) sum = 1;
  304.  
  305. // Add up trailing defaults values
  306. if (bTrailing)
  307. {
  308. sum += (MAX_FEATURES-feature-1) * (INIT * INIT);
  309. if (sum > 5) sum = 5;
  310. if (sum < 1) sum = 1;
  311. }
  312.  
  313. return sum;
  314. }
  315.  
  316. //
  317. // PredictRating
  318. // - This version is used for calculating the final results
  319. // - It loops through the entire list of finished features
  320. //
  321. double Engine::PredictRating(short movieId, int custId)
  322. {
  323. double sum = 1; //m_aMovies[movieId].PseudoAvg;
  324.  
  325. for (int f=0; f<MAX_FEATURES; f++)
  326. {
  327. sum += m_aMovieFeatures[f][movieId] * m_aCustFeatures[f][custId];
  328. if (sum > 5) sum = 5;
  329. if (sum < 1) sum = 1;
  330. }
  331.  
  332. return sum;
  333. }
  334.  
  335. //-------------------------------------------------------------------
  336. // Data Loading / Saving
  337. //-------------------------------------------------------------------
  338.  
  339. //
  340. // LoadHistory
  341. // - Loop through all of the files in the training directory
  342. //
  343. void Engine::LoadHistory()
  344. {
  345. WIN32_FIND_DATA FindFileData;
  346. HANDLE hFind;
  347. bool bContinue = true;
  348. int count = 0; // TEST
  349.  
  350. // Loop through all of the files in the training directory
  351. hFind = FindFirstFile(TRAINING_PATH, &FindFileData);
  352. if (hFind == INVALID_HANDLE_VALUE) return;
  353.  
  354. while (bContinue)
  355. {
  356. this->ProcessFile(FindFileData.cFileName);
  357. bContinue = (FindNextFile(hFind, &FindFileData) != 0);
  358.  
  359. //if (++count > 999) break; // TEST: Uncomment to only test with the first X movies
  360. }
  361.  
  362. FindClose(hFind);
  363. }
  364.  
  365. //
  366. // ProcessFile
  367. // - Load a history file in the format:
  368. //
  369. // <MovieId>:
  370. // <CustomerId>,<Rating>
  371. // <CustomerId>,<Rating>
  372. // ...
  373. void Engine::ProcessFile(wchar_t* pwzFile)
  374. {
  375. FILE *stream;
  376. wchar_t pwzBuffer[1000];
  377. wsprintf(pwzBuffer,TRAINING_FILE,pwzFile);
  378. int custId, movieId, rating, pos = 0;
  379.  
  380. wprintf(L"Processing file: %s
  381. ", pwzBuffer);
  382.  
  383. if (_wfopen_s(&stream, pwzBuffer, L"r") != 0) return;
  384.  
  385. // First line is the movie id
  386. fgetws(pwzBuffer, 1000, stream);
  387. ParseInt(pwzBuffer, (int)wcslen(pwzBuffer), pos, movieId);
  388. m_aMovies[movieId].RatingCount = 0;
  389. m_aMovies[movieId].RatingSum = 0;
  390.  
  391. // Get all remaining rows
  392. fgetws(pwzBuffer, 1000, stream);
  393. while ( !feof( stream ) )
  394. {
  395. pos = 0;
  396. ParseInt(pwzBuffer, (int)wcslen(pwzBuffer), pos, custId);
  397. ParseInt(pwzBuffer, (int)wcslen(pwzBuffer), pos, rating);
  398.  
  399. m_aRatings[m_nRatingCount].MovieId = (short)movieId;
  400. m_aRatings[m_nRatingCount].CustId = custId;
  401. m_aRatings[m_nRatingCount].Rating = (BYTE)rating;
  402. m_aRatings[m_nRatingCount].Cache = 0;
  403. m_nRatingCount++;
  404.  
  405. fgetws(pwzBuffer, 1000, stream);
  406. }
  407.  
  408. // Cleanup
  409. fclose( stream );
  410. }
  411.  
  412. //
  413. // ProcessTest
  414. // - Load a sample set in the following format
  415. //
  416. // <Movie1Id>:
  417. // <CustomerId>
  418. // <CustomerId>
  419. // ...
  420. // <Movie2Id>:
  421. // <CustomerId>
  422. //
  423. // - And write results:
  424. //
  425. // <Movie1Id>:
  426. // <Rating>
  427. // <Raing>
  428. // ...
  429. void Engine::ProcessTest(wchar_t* pwzFile)
  430. {
  431. FILE *streamIn, *streamOut;
  432. wchar_t pwzBuffer[1000];
  433. int custId, movieId, pos = 0;
  434. double rating;
  435. bool bMovieRow;
  436.  
  437. wsprintf(pwzBuffer, TEST_PATH, pwzFile);
  438. wprintf(L"
  439.  
  440. Processing test: %s
  441. ", pwzBuffer);
  442.  
  443. if (_wfopen_s(&streamIn, pwzBuffer, L"r") != 0) return;
  444. if (_wfopen_s(&streamOut, PREDICTION_FILE, L"w") != 0) return;
  445.  
  446. fgetws(pwzBuffer, 1000, streamIn);
  447. while ( !feof( streamIn ) )
  448. {
  449. bMovieRow = false;
  450. for (int i=0; i<(int)wcslen(pwzBuffer); i++)
  451. {
  452. bMovieRow |= (pwzBuffer[i] == 58);
  453. }
  454.  
  455. pos = 0;
  456. if (bMovieRow)
  457. {
  458. ParseInt(pwzBuffer, (int)wcslen(pwzBuffer), pos, movieId);
  459.  
  460. // Write same row to results
  461. fputws(pwzBuffer,streamOut);
  462. }
  463. else
  464. {
  465. ParseInt(pwzBuffer, (int)wcslen(pwzBuffer), pos, custId);
  466. custId = m_mCustIds[custId];
  467. rating = PredictRating(movieId, custId);
  468.  
  469. // Write predicted value
  470. swprintf(pwzBuffer,1000,L"%5.3f
  471. ",rating);
  472. fputws(pwzBuffer,streamOut);
  473. }
  474.  
  475. //wprintf(L"Got Line: %d %d %d
  476. ", movieId, custId, rating);
  477. fgetws(pwzBuffer, 1000, streamIn);
  478. }
  479.  
  480. // Cleanup
  481. fclose( streamIn );
  482. fclose( streamOut );
  483. }
  484.  
  485. //-------------------------------------------------------------------
  486. // Helper Functions
  487. //-------------------------------------------------------------------
  488. bool Engine::ReadNumber(wchar_t* pwzBufferIn, int nLength, int &nPosition, wchar_t* pwzBufferOut)
  489. {
  490. int count = 0;
  491. int start = nPosition;
  492. wchar_t wc = 0;
  493.  
  494. // Find start of number
  495. while (start < nLength)
  496. {
  497. wc = pwzBufferIn[start];
  498. if ((wc >= 48 && wc <= 57) || (wc == 45)) break;
  499. start++;
  500. }
  501.  
  502. // Copy each character into the output buffer
  503. nPosition = start;
  504. while (nPosition < nLength && ((wc >= 48 && wc <= 57) || wc == 69 || wc == 101 || wc == 45 || wc == 46))
  505. {
  506. pwzBufferOut[count++] = wc;
  507. wc = pwzBufferIn[++nPosition];
  508. }
  509.  
  510. // Null terminate and return
  511. pwzBufferOut[count] = 0;
  512. return (count > 0);
  513. }
  514.  
  515. bool Engine::ParseFloat(wchar_t* pwzBuffer, int nLength, int &nPosition, float& fValue)
  516. {
  517. wchar_t pwzNumber[20];
  518. bool bResult = ReadNumber(pwzBuffer, nLength, nPosition, pwzNumber);
  519. fValue = (bResult) ? (float)_wtof(pwzNumber) : 0;
  520. return false;
  521. }
  522.  
  523. bool Engine::ParseInt(wchar_t* pwzBuffer, int nLength, int &nPosition, int& nValue)
  524. {
  525. wchar_t pwzNumber[20];
  526. bool bResult = ReadNumber(pwzBuffer, nLength, nPosition, pwzNumber);
  527. nValue = (bResult) ? _wtoi(pwzNumber) : 0;
  528. return bResult;
  529. }

URL: http://www.timelydevelopment.com/demos/NetflixPrize.aspx

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.