Posted By

diggernaut on 12/05/17


Tagged

data etl scraping extraction abercrombie diggernaut


Versions (?)

Scraping abercrombie.com with Diggernaut


 / Published in: Other
 

URL: https://www.diggernaut.com

This config can be used with the Diggernaut service to scrape abercrombie.com and retrieve product information. Attention: you will need to use your own proxy for this digger, as Diggernaut's basic proxies don't work with abercrombie.com.

  1. Create a free account at diggernaut.com
  2. Login to your account
  3. Create a project with any name and description you want
  4. Open your new project by clicking it and create a new digger with any name
  5. You will then be offered 3 options; choose the one that uses the meta-language
  6. The config editor will open; copy and paste the config code into it and click the Save button.
  7. Run your digger.
  8. Wait for completion.
  9. Download data.
  10. Schedule your runs if required.
  # Global digger settings.
  # NOTE(review): this listing carries list-number prefixes ("1.", "2.", ...)
  # and has lost its YAML indentation; both have to be repaired before a YAML
  # parser will accept it.
  1. ---
  2. config:
  3. debug: 2
  # As written the value of `proxy` is empty: everything after `#` is a YAML
  # comment, so a real proxy address has to be filled in here.
  4. proxy: #USE YOUR PROXY HERE LIKE 1.1.1.1:8888
  # Browser identification string; presumably sent as the User-Agent of the
  # digger's requests - confirm against the Diggernaut documentation.
  5. agent: Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14
  # Start of the command list. The first command adds the two entry URLs
  # (the /shop/wd storefront and its /kids section) to the link pool `main`.
  6. do:
  7. - link_add:
  8. pool: main
  9. url:
  10. - https://www.abercrombie.com/shop/wd
  11. - https://www.abercrombie.com/shop/wd/kids
  # Walk the links held in pool `main`, using the request headers listed below.
  # NOTE(review): indentation is lost in this listing, so the nesting of the
  # following commands is inferred from their order only.
  12. - walk:
  13. to: links
  14. headers:
  15. Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
  16. Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4
  17. Cache-Control: no-cache
  18. Pragma: no-cache
  19. Proxy-Connection: keep-alive
  20. Upgrade-Insecure-Requests: 1
  21. pool: main
  22. do:
  # Navigation category anchors: read the href attribute, then apply
  # space_dedupe and trim to it.
  23. - find:
  24. path: a.rs-nav__cat-label
  25. do:
  26. - parse:
  27. attr: href
  28. - space_dedupe
  29. - trim
  # Only values matching ^\/shop are processed further: they go through the
  # `url` normalize routine (presumably making them absolute - confirm) and
  # are added to pool `catalog`.
  30. - if:
  31. match: ^\/shop
  32. do:
  33. - normalize:
  34. routine: url
  35. - link_add:
  36. pool: catalog
  # Walk the links held in pool `catalog`.
  # NOTE(review): indentation is lost in this listing, so the nesting of the
  # following commands is inferred from their order only.
  37. - walk:
  38. to: links
  39. pool: catalog
  40. do:
  # Reset the `section` variable before it is filled from the breadcrumbs.
  41. - variable_clear: section
  # Minor navigation anchors: read href, apply space_dedupe and trim, and
  # continue only for values matching \w+.
  42. - find:
  43. path: li.rs-nav-item--minor>a
  44. do:
  45. - parse:
  46. attr: href
  47. - space_dedupe
  48. - trim
  49. - if:
  50. match: \w+
  51. do:
  52. - normalize:
  53. routine: url
  # This `if` has only an `else` branch: link_add to pool `catalog` is listed
  # for values that do NOT match hollisterco\.com.
  54. - if:
  55. match: hollisterco\.com
  56. else:
  57. - link_add:
  58. pool: catalog
  # Store the breadcrumbs element's data-categoryid attribute in `section`.
  59. - find:
  60. path: div.breadcrumbs
  61. do:
  62. - parse:
  63. attr: data-categoryid
  64. - variable_set: section
  # Register the `json` converter for content type text/html; presumably so
  # that the AjaxNavResults response below can be queried with find - confirm.
  65. - set_converter:
  66. content_type: text/html
  67. converter: json
  # Request the AjaxNavResults endpoint with the <%section%> placeholder as
  # categoryId (start=0, quantity=10000 in the query string).
  68. - walk:
  69. to: https://www.abercrombie.com/webapp/wcs/stores/servlet/AjaxNavResults?storeId=11203&catalogId=10901&langId=-1&categoryId=<%section%>&start=0&quantity=10000&setCurrentPage=1
  70. do:
  # Product URLs that match \/p\/ are added to pool `pages`.
  71. - find:
  72. path: products>producturl
  73. do:
  74. - parse
  75. - if:
  76. match: \/p\/
  77. do:
  78. - link_add:
  79. pool: pages
  # Swatch URLs are filtered the same way and added to the same pool.
  80. - find:
  81. path: products>swatches>swatchurl
  82. do:
  83. - parse
  84. - if:
  85. match: \/p\/
  86. do:
  87. - link_add:
  88. pool: pages
  # Remove the text/html converter that was registered above.
  89. - clear_converter:
  90. content_type: text/html
  # Walk the links held in pool `pages` and build one `product` object per
  # page. The cookie header carries a URL-encoded uPref value, which decodes to
  # {"cfi":"1","cur":"USD","sf":"US"}, and a geoLocation of US:TX:.
  # NOTE(review): indentation is lost in this listing, so the nesting of the
  # following commands is inferred from their order only.
  91. - walk:
  92. to: links
  93. headers:
  94. Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
  95. Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4
  96. Cache-Control: no-cache
  97. Pragma: no-cache
  98. cookie: uPref=%7B%22cfi%22%3A%221%22%2C%22cur%22%3A%22USD%22%2C%22sf%22%3A%22US%22%7D; geoLocation=US:TX:;
  99. Proxy-Connection: keep-alive
  100. Upgrade-Insecure-Requests: 1
  101. pool: pages
  102. do:
  # Pause before processing the page (unit not stated here; presumably
  # seconds - confirm).
  103. - sleep: 3
  # Remember the current page URL (static value `url`) in variable `pageurl`.
  104. - variable_clear: pageurl
  105. - find:
  106. path: body
  107. do:
  108. - static_get: url
  109. - variable_set: pageurl
  # Product section: reset the working variables and start a new `product`
  # object.
  # NOTE(review): `sin`, `list` and `desc` are cleared here but never set or
  # read anywhere in this listing.
  110. - find:
  111. path: 'section.product'
  112. do:
  113. - variable_clear: pid
  114. - variable_clear: prid
  115. - variable_clear: cid
  116. - variable_clear: sin
  117. - variable_clear: list
  118. - variable_clear: desc
  119. - variable_clear: price
  120. - object_new: product
  # Item-number element: its parsed content becomes field `sku`; its
  # data-collection, data-seq and data-productid attributes are stored in
  # `pid`, `cid` and `prid`.
  # NOTE(review): `prid` is set here but never read in this listing.
  121. - find:
  122. path: div.details__web-item-number>span.number
  123. do:
  124. - parse
  125. - object_field_set:
  126. object: product
  127. field: sku
  128. - parse:
  129. attr: data-collection
  130. - variable_set: pid
  131. - parse:
  132. attr: data-seq
  133. - variable_set: cid
  134. - parse:
  135. attr: data-productid
  136. - variable_set: prid
  # Field `date`: result of the JS snippet, i.e. the current time from
  # Date.toISOString().
  137. - eval:
  138. routine: js
  139. body: '(function (){var d = new Date(); return d.toISOString()})();'
  140. - object_field_set:
  141. object: product
  142. field: date
  # Field `url`: the page URL captured earlier in `pageurl`.
  143. - variable_get: pageurl
  144. - object_field_set:
  145. object: product
  146. field: url
  # Field `name`: content of the product title element after space_dedupe
  # and trim.
  147. - find:
  148. path: .product-page-title[itemprop="name"]
  149. do:
  150. - parse
  151. - space_dedupe
  152. - trim
  153. - object_field_set:
  154. object: product
  155. field: name
  # Field `brand`: constant string.
  156. - register_set: 'Abercrombie & Fitch'
  157. - object_field_set:
  158. object: product
  159. field: brand
  # Field `description`: content of the description paragraph after
  # space_dedupe and trim.
  160. - find:
  161. path: p.details__description
  162. do:
  163. - parse
  164. - space_dedupe
  165. - trim
  166. - object_field_set:
  167. object: product
  168. field: description
  # Inline script containing 'var globalProducts': the filter regex captures
  # the value assigned to globalProducts[N]; it is run through json2xml and
  # to_block, and the first `offerpricefmt` node (slice: 0) is stored in
  # variable `price`.
  169. - find:
  170. path: script:contains('var globalProducts')
  171. do:
  172. - parse:
  173. filter: globalProducts\[\d+\]\s+\=\s+(.+);\s+\}\s+catch\(err\)
  174. - normalize:
  175. routine: json2xml
  176. - to_block
  177. - find:
  178. path: offerpricefmt
  179. slice: 0
  180. do:
  181. - parse
  182. - variable_set: price
  # Field `category`: non-empty breadcrumb link texts, looked up with
  # `in: doc` and joined by "|".
  183. - find:
  184. in: doc
  185. path: div.upper-breadcrumb>div.breadcrumbs>a
  186. do:
  187. - parse
  188. - space_dedupe
  189. - trim
  190. - if:
  191. match: \w+
  192. do:
  193. - object_field_set:
  194. object: product
  195. field: category
  196. joinby: "|"
  # Field `variations`: non-empty swatch label texts joined by "|".
  197. - find:
  198. path: ul.product-swatches>li>label>span,ul.product-swatches>li>h2>span
  199. do:
  200. - parse
  201. - space_dedupe
  202. - trim
  203. - if:
  204. match: \w+
  205. do:
  206. - object_field_set:
  207. object: product
  208. field: variations
  209. joinby: "|"
  # Price element on the page: two filter regexes are listed, the first for
  # a "$x -" range form and the second for a single "$x" amount.
  # NOTE(review): this writes the same `price` variable as the offerpricefmt
  # step above.
  210. - find:
  211. path: span.product-price-v2__price
  212. do:
  213. - parse:
  214. filter:
  215. - (\$[0-9\.]+)\s*-
  216. - (\$[0-9\.]+)
  217. - variable_set: price
  # Field `price`: variable `price` with `$` replaced by an empty string,
  # stored with type float.
  218. - variable_get: price
  219. - normalize:
  220. routine: replace_substring
  221. args:
  222. \$: ''
  223. - object_field_set:
  224. object: product
  225. type: float
  226. field: price
  # Field `currency`: variable `price` passed through replace_matched with
  # \$ -> USD (presumably yielding the currency code - confirm the routine's
  # semantics).
  227. - variable_get: price
  228. - normalize:
  229. routine: replace_matched
  230. args:
  231. \$: USD
  232. - object_field_set:
  233. object: product
  234. field: currency
  # Build the scene7 image-set URL from `pid` and `cid`: the kids_ variant
  # when the page URL matches \/kids\/, the anf_ variant otherwise.
  235. - variable_get: pageurl
  236. - if:
  237. match: \/kids\/
  238. do:
  239. - register_set: https://anf.scene7.com/is/image/anf?imageset={anf/kids_<%pid%>_<%cid%>}&req=set,json&id=<%pid%>
  240. else:
  241. - register_set: https://anf.scene7.com/is/image/anf?imageset={anf/anf_<%pid%>_<%cid%>}&req=set,json&id=<%pid%>
  # Fetch that URL (`to: value`), capture the argument of s7jsonResponse(...),
  # run unescape_html and json2xml on it, and append one image URL per `i>n`
  # node to field `images` (joined by "|").
  242. - walk:
  243. to: value
  244. do:
  245. - find:
  246. path: body
  247. do:
  248. - parse:
  249. filter: s7jsonResponse\((.+),\&quot;\d+\&quot;\);
  250. - normalize:
  251. routine: unescape_html
  252. - normalize:
  253. routine: json2xml
  254. - to_block
  255. - find:
  256. path: i>n
  257. do:
  258. - variable_clear: iurl
  259. - parse
  260. - variable_set: iurl
  261. - register_set: https://anf.scene7.com/is/image/<%iurl%>?scl=1
  262. - object_field_set:
  263. object: product
  264. field: images
  265. joinby: "|"
  # Swatch inputs: the same image lookup is repeated with `cid` taken from
  # each input's data-seq attribute; results go to the same `images` field.
  266. - find:
  267. path: ul.product-swatches>li>label>input
  268. do:
  269. - variable_clear: cid
  270. - parse:
  271. attr: data-seq
  272. - variable_set: cid
  273. - variable_get: pageurl
  274. - if:
  275. match: \/kids\/
  276. do:
  277. - register_set: https://anf.scene7.com/is/image/anf?imageset={anf/kids_<%pid%>_<%cid%>}&req=set,json&id=<%pid%>
  278. else:
  279. - register_set: https://anf.scene7.com/is/image/anf?imageset={anf/anf_<%pid%>_<%cid%>}&req=set,json&id=<%pid%>
  280. - walk:
  281. to: value
  282. do:
  283. - find:
  284. path: body
  285. do:
  286. - parse:
  287. filter: s7jsonResponse\((.+),\&quot;\d+\&quot;\);
  288. - normalize:
  289. routine: unescape_html
  290. - normalize:
  291. routine: json2xml
  292. - to_block
  293. - find:
  294. path: i>n
  295. do:
  296. - variable_clear: iurl
  297. - parse
  298. - variable_set: iurl
  299. - register_set: https://anf.scene7.com/is/image/<%iurl%>?scl=1
  300. - object_field_set:
  301. object: product
  302. field: images
  303. joinby: "|"
  # Save the assembled `product` object.
  304. - object_save:
  305. name: product

Report this snippet  

You need to login to post a comment.