Posted By

Firsh on 04/23/15


Tagged

YouTube


Versions (?)

Who likes this?

1 person has marked this snippet as a favorite

kashif


Improved YouTube scrapers


 / Published in: PHP
 

URL: http://justifiedgrid.com/

http://stackoverflow.com/questions/29752447/how-to-get-a-youtube-channel-rss-feed-after-2015-april-20-without-v3-api

  /**
   * Dispatch a YouTube source URL to the matching scraper.
   *
   * URLs containing "gdata.youtube.com" or "youtube.com/user/" are handed to the
   * channel scraper; URLs containing "list=" are handed to the playlist scraper.
   * Matching is case-insensitive.
   *
   * @param string     $rss_url Source URL (legacy gdata feed, channel URL or playlist URL).
   * @param int|string $limit   Maximum number of items; a numeric zero means "no limit".
   * @return array|string Whatever the selected scraper returns, or a translated error
   *                      message when the kind of source cannot be recognised.
   */
  function scrape_youtube($rss_url, $limit){
      // The scrapers stop once their item counter equals $limit, and that counter never
      // reaches -1, so -1 stands for "unlimited". Numeric strings such as "0" are
      // normalised too: left alone, a loose comparison of the counter against "0"
      // would end the loop before the first item was collected.
      if (is_numeric($limit) && (int) $limit === 0) {
          $limit = -1;
      }

      if (stripos($rss_url, 'gdata.youtube.com') !== false || stripos($rss_url, 'youtube.com/user/') !== false) {
          return $this->scrape_youtube_channel($rss_url, $limit);
      }

      if (stripos($rss_url, 'list=') !== false) {
          return $this->scrape_youtube_playlist($rss_url, $limit);
      }

      return __('YouTube source could not be determined.', 'jig_td');
  }
  /**
   * Scrape the public HTML page of a YouTube playlist into feed-like items.
   *
   * @param string     $rss_url Any URL carrying a "list=<playlist id>" parameter.
   * @param int|string $limit   Maximum number of items to collect. The loop stops once the
   *                            number of collected items equals this value, so -1 never stops it.
   * @return array|string List of JIGstdClass items (get_title, get_description, get_date,
   *                      get_enclosures, get_permalink), or a translated error message when
   *                      no playlist ID can be extracted from $rss_url.
   */
  function scrape_youtube_playlist($rss_url, $limit){
      // "+" instead of "*": an empty ID (e.g. "list=&foo") must be reported as an error
      // rather than turned into a request for a playlist with no ID.
      if (!preg_match('/(?<=list=)[^&#?\s]+/i', $rss_url, $regs)) {
          return __('YouTube playlist ID could not be determined.', 'jig_td');
      }
      $url = "https://www.youtube.com/playlist?list=".$regs[0]."&hl=en";

      // Links in the result use the same scheme as the page currently being served.
      $host = !is_ssl() ? 'http://www.youtube.com' : 'https://www.youtube.com';

      $rss_items = array();

      $html = $this->file_get_contents_curl($url);
      if (!is_string($html) || $html === '') {
          // Nothing was downloaded; DOMDocument::loadHTML() rejects an empty string on PHP 8.
          return $rss_items;
      }

      // Non-ASCII characters are turned into entities so the HTML parser does not guess the
      // charset. NOTE: the 'HTML-ENTITIES' target is deprecated as of PHP 8.2.
      $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');

      // Collect markup warnings internally instead of silencing them with "@".
      $previous_libxml = libxml_use_internal_errors(true);
      $doc = new DOMDocument();
      $doc->loadHTML($html);
      libxml_clear_errors();
      libxml_use_internal_errors($previous_libxml);

      $xpath = new DOMXpath($doc);

      $videos = $xpath->query('//tr[contains(concat(" ", normalize-space(@class), " "), " yt-uix-tile ")]');
      if ($videos === false) {
          return $rss_items;
      }

      $count = 0;
      foreach ($videos as $video) {
          if ($count == $limit) {
              break;
          }

          $anchor = $xpath->query('.//a[contains(concat(" ", normalize-space(@class), " "), " yt-uix-tile-link ")][starts-with(@href, "/watch")]', $video)->item(0);
          if (!$anchor) {
              // Row without a watch link: there is neither a title nor a permalink to report.
              continue;
          }

          $title = trim($anchor->nodeValue);
          if ($title == "[Private Video]" || $title == "[Deleted Video]") {
              // Placeholder rows are skipped and do not count towards the limit.
              continue;
          }

          $rss_item = new JIGstdClass();
          $rss_item->get_title = $title;

          // The owner name and href come from scraped markup and are placed into HTML,
          // so both are escaped.
          $ownerAnchor = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " pl-video-owner ")]/a', $video)->item(0);
          if ($ownerAnchor) {
              $owner_href = htmlspecialchars($host.$ownerAnchor->getAttribute('href'), ENT_QUOTES, 'UTF-8');
              $owner_name = htmlspecialchars(trim($ownerAnchor->nodeValue), ENT_QUOTES, 'UTF-8');
              $rss_item->get_description = __('by','jig_td').' <a href="'.$owner_href.'" target="_blank">'.$owner_name.'</a>';
          } else {
              $rss_item->get_description = '';
          }

          $rss_item->get_date = __("No date available.","jig_td");

          // A missing <img> yields an empty link, the same value a missing data-thumb
          // attribute produces.
          $thumb = $xpath->query('.//img', $video)->item(0);
          $enclosure = new JIGstdClass();
          $enclosure->get_link = $thumb ? str_replace('/default.jpg', '/maxresdefault.jpg', $thumb->getAttribute('data-thumb')) : '';
          $rss_item->get_enclosures = array($enclosure);

          $rss_item->get_permalink = $host.$anchor->getAttribute('href');

          $rss_items[] = $rss_item;
          $count++;
      }

      return $rss_items;
  }
  60.  
  61.  
  /**
   * Scrape the public "videos" page of a YouTube user channel into feed-like items.
   *
   * The username is taken either from a legacy gdata feed URL, e.g.
   * http://gdata.youtube.com/feeds/base/users/NAME/uploads?max-results=50
   * or from a channel URL of the form youtube.com/user/NAME.
   *
   * @param string     $rss_url Legacy feed URL or channel URL.
   * @param int|string $limit   Maximum number of items to collect. The loop stops once the
   *                            number of collected items equals this value, so -1 never stops it.
   * @return array|string List of JIGstdClass items (get_title, get_description, get_date,
   *                      get_enclosures, get_permalink), or a translated error message when
   *                      no username can be extracted from $rss_url.
   */
  function scrape_youtube_channel($rss_url, $limit){
      // The name ends at the next "/", "?", "#" or whitespace. A greedy ".*" up to the last
      // slash would swallow further path or query parts that happen to contain a slash,
      // and "[^/]*" would swallow a query string or match an empty name.
      if (!preg_match('%(?<=/feeds/base/users/)[^/?#\s]+%i', $rss_url, $regs)
          && !preg_match('%(?<=youtube\.com/user/)[^/?#\s]+%i', $rss_url, $regs)) {
          return __('YouTube username could not be determined.', 'jig_td');
      }
      $url = "https://www.youtube.com/user/".$regs[0]."/videos?flow=list&sort=dd&hl=en";

      // Links in the result use the same scheme as the page currently being served.
      $host = !is_ssl() ? 'http://www.youtube.com' : 'https://www.youtube.com';

      $rss_items = array();

      $html = $this->file_get_contents_curl($url);
      if (!is_string($html) || $html === '') {
          // Nothing was downloaded; DOMDocument::loadHTML() rejects an empty string on PHP 8.
          return $rss_items;
      }

      // Non-ASCII characters are turned into entities so the HTML parser does not guess the
      // charset. NOTE: the 'HTML-ENTITIES' target is deprecated as of PHP 8.2.
      $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');

      // Collect markup warnings internally instead of silencing them with "@".
      $previous_libxml = libxml_use_internal_errors(true);
      $doc = new DOMDocument();
      $doc->loadHTML($html);
      libxml_clear_errors();
      libxml_use_internal_errors($previous_libxml);

      $xpath = new DOMXpath($doc);

      $videos = $xpath->query('//li[contains(concat(" ", normalize-space(@class), " "), " feed-item-container ")]');
      if ($videos === false) {
          return $rss_items;
      }

      $count = 0;
      foreach ($videos as $video) {
          if ($count == $limit) {
              break;
          }

          $anchor = $xpath->query('.//a[contains(concat(" ", normalize-space(@class), " "), " yt-uix-tile-link ")][starts-with(@href, "/watch")]', $video)->item(0);
          if (!$anchor) {
              // Feed entry without a watch link: there is neither a title nor a permalink.
              continue;
          }

          // Optional parts of an entry; each may be absent from the markup.
          $description_node = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " yt-lockup-description ")]', $video)->item(0);
          $date_node = $xpath->query('.//ul[contains(concat(" ", normalize-space(@class), " "), " yt-lockup-meta-info ")]/*[1]', $video)->item(0);
          $thumb = $xpath->query('.//img', $video)->item(0);

          $rss_item = new JIGstdClass();
          $rss_item->get_title = trim($anchor->getAttribute('title'));
          $rss_item->get_description = $description_node ? trim($description_node->nodeValue) : '';
          // Same fallback text the playlist scraper uses for items without a date.
          $rss_item->get_date = $date_node ? $date_node->nodeValue : __("No date available.","jig_td");

          // A missing <img> yields an empty link, the same value a missing data-thumb
          // attribute produces.
          $enclosure = new JIGstdClass();
          $enclosure->get_link = $thumb ? str_replace('/mqdefault.jpg', '/maxresdefault.jpg', $thumb->getAttribute('data-thumb')) : '';
          $rss_item->get_enclosures = array($enclosure);

          $rss_item->get_permalink = $host.$anchor->getAttribute('href');

          $rss_items[] = $rss_item;
          $count++;
      }

      return $rss_items;
  }

Report this snippet  

You need to login to post a comment.