Return to Snippet

Revision: 70910
at September 10, 2017 03:02 by martinson


Updated Code
---
# Global settings for the run.
config:
    # Presumably the browser identity used for requests — confirm against the
    # Diggernaut documentation.
    agent: Firefox
# Input fieldset: the `user` value is referenced as <%user%> in the profile
# URL of the walk below.
iterator:
    - type: fieldset
      fields:
      - user: someusername
do:
- walk:
    to: https://www.instagram.com/<%user%>/
    do:
    # Start with pagination switched off; `repeat` is set to 'yes' further
    # down only when a page_info block reports has_next_page.
    - variable_set:
        field: repeat
        value: 'no'
    # Clear queryid before it is (re)discovered from Commons.js below
    # (presumably so a stale value is never reused — confirm).
    - variable_clear: queryid
    # First pass over the profile page: store the CSRF token and the `mid`
    # cookie in variables; both are referenced by the Commons.js request below.
    - find:
        path: body
        do:
        # Keep only the JSON assigned to window._sharedData.
        - parse:
            filter: window\._sharedData\s+\=\s+([^;]+);
        # Convert the JSON to XML so the nested find paths can address it.
        - normalize:
            routine: json2xml
        - to_block
        - find:
            path: config>csrf_token
            do:
            - parse
            - variable_set: token
            - cookie_get: mid
            - variable_set: mid
            
    # Look through the page's script tags for the Commons.js bundle and pull
    # the numeric queryId out of it; the GraphQL pagination walk further down
    # needs it as <%queryid%>.
    - find:
        path: script[type="text/javascript"]
        do:
        - parse:
            attr: src
        - if:
            match: Commons\.js
            do:
            # Normalize the src value as a URL before walking to it.
            - normalize:
                routine: url
            - walk:
                to: value
                headers:
                    # Fixed: the header name was misspelled "Cookue", so the
                    # csrftoken/mid pair was never sent as a Cookie header.
                    Cookie: csrftoken=<%token%>; mid=<%mid%>;
                do:
                - find:
                    path: script
                    do:
                    - parse
                    - normalize:
                        routine: unescape_html
                    # Capture the digits of the queryId that follows the
                    # profilePosts pagination code in the bundle.
                    - filter:
                        args:
                            return\s*e\.profilePosts\.byUserId\.get\(t\)\.pagination\}\,queryId\:\"(\d+)\"
                    # Store the value only when digits were actually captured.
                    - if:
                        match: \d+
                        do:
                        - variable_set: queryid
    - find:
        path: body
        do:
        # Store the `mid` cookie again; it is sent in the Cookie header of
        # the GraphQL pagination walk further down.
        - cookie_get: mid
        - variable_set: mid
        # Keep only the JSON assigned to window._sharedData and convert it to
        # XML so the find paths below can address it.
        - parse:
            filter: window\._sharedData\s+\=\s+([^;]+);
        - normalize:
            routine: json2xml
        - to_block
        # Store the CSRF token in variable `token`.
        - find:
            path: config>csrf_token
            do:
            - parse
            - variable_set: token
        - find:
            path: entry_data>profilepage
            do:
            # Store the post base URL in variable `baseurl` (not referenced
            # again in this revision of the script).
            - register_set: https://www.instagram.com/p
            - variable_set: baseurl
            # Create the top-level `user` object; each find below copies one
            # profile value into the field of the same name.
            - object_new: user
            - find:
                path: user>id
                do:
                - parse
                - object_field_set:
                    object: user
                    field: id
                # Also kept as <%userid%> for the GraphQL pagination URL.
                - variable_set: userid
            - find:
                path: user>username
                do:
                - parse
                - object_field_set:
                    object: user
                    field: username
            - find:
                path: user>full_name
                do:
                - parse
                - object_field_set:
                    object: user
                    field: full_name
            - find:
                path: user>biography
                do:
                - parse
                - object_field_set:
                    object: user
                    field: biography
            - find:
                path: user>profile_pic_url
                do:
                - parse
                - object_field_set:
                    object: user
                    field: profile_pic_url
            - find:
                path: user>profile_pic_url_hd
                do:
                - parse
                - object_field_set:
                    object: user
                    field: profile_pic_url_hd
            - find:
                path: user>external_url
                do:
                - parse
                - object_field_set:
                    object: user
                    field: external_url
            - find:
                path: user>external_url_linkshimmed
                do:
                - parse
                - object_field_set:
                    object: user
                    field: external_url_linkshimmed
            - find:
                path: user>country_block
                do:
                - parse
                - object_field_set:
                    object: user
                    field: country_block
            # The two counters are stored under shorter field names
            # (`follows`, `followed_by`) than their source paths.
            - find:
                path: user>follows>count
                do:
                - parse
                - object_field_set:
                    object: user
                    field: follows
            - find:
                path: user>followed_by>count
                do:
                - parse
                - object_field_set:
                    object: user
                    field: followed_by
            # Posts embedded in the profile page itself: for each element
            # matched by user>media>nodes (presumably one per post) build a
            # `nodes` object and save it into `user`.
            - find:
                path: user>media>nodes
                do:
                - object_new: nodes
                - find:
                    path: id
                    do:
                    - parse
                    - object_field_set:
                        object: nodes
                        field: id
                - find:
                    path: is_video
                    do:
                    - parse
                    - object_field_set:
                        object: nodes
                        field: is_video
                - find:
                    path: video_views
                    do:
                    - parse
                    - object_field_set:
                        object: nodes
                        field: video_views
                # Reformat the date from '%s' (presumably Unix epoch seconds)
                # to 'YYYY-MM-DD HH:MM:SS'.
                - find:
                    path: date
                    do:
                    - parse
                    - normalize:
                        routine: date_format
                        args:
                            format_in: '%s'
                            format_out: '%Y-%m-%d %H:%M:%S'
                    - object_field_set:
                        object: nodes
                        field: date
                - find:
                    path: dimensions>width
                    do:
                    - parse
                    - object_field_set:
                        object: nodes
                        field: width
                - find:
                    path: dimensions>height
                    do:
                    - parse
                    - object_field_set:
                        object: nodes
                        field: height
                - find:
                    path: likes>count
                    do:
                    - parse
                    - object_field_set:
                        object: nodes
                        field: likes_count
                - find:
                    path: comments>count
                    do:
                    - parse
                    - object_field_set:
                        object: nodes
                        field: comments_count
                - find:
                    path: comments_disabled
                    do:
                    - parse
                    - object_field_set:
                        object: nodes
                        field: comments_disabled
                # Source element caption_safe is stored as field `caption`.
                - find:
                    path: caption_safe
                    do:
                    - parse
                    - object_field_set:
                        object: nodes
                        field: caption
                # thumbnail_src -> `thumbnail`, display_src -> `media`.
                - find:
                    path: thumbnail_src
                    do:
                    - parse
                    - object_field_set:
                        object: nodes
                        field: thumbnail
                - find:
                    path: display_src
                    do:
                    - parse
                    - object_field_set:
                        object: nodes
                        field: media
                # Attach the finished post object to the `user` object.
                - object_save:
                    name: nodes
                    to: user
            # Pagination state from the profile page: decide whether the
            # GraphQL walk below has to repeat, and remember where to resume.
            - find:
                path: user>media>page_info
                do:
                - find:
                    path: has_next_page
                    do:
                    - parse
                    - if:
                        # Quoted so the pattern stays the string "true" and is
                        # not read as a YAML boolean (same reason 'yes'/'no'
                        # are quoted in this file).
                        match: 'true'
                        do:
                        - variable_set:
                            field: repeat
                            value: 'yes'
                - find:
                    path: end_cursor
                    do:
                    - parse
                    # URL-encode the cursor (the current register value) so it
                    # can be embedded in the query string as <%cursor%>.
                    - eval:
                        routine: js
                        body: '(function () {return encodeURIComponent("<%register%>")})();'
                    - variable_set: cursor
            - walk:
                # The URL-encoded `variables` parameter decodes to
                # {"id":"<%userid%>","first":12,"after":"<%cursor%>"}.
                to: https://www.instagram.com/graphql/query/?query_id=<%queryid%>&variables=%7B%22id%22%3A%22<%userid%>%22%2C%22first%22%3A12%2C%22after%22%3A%22<%cursor%>%22%7D
                # Controlled by the `repeat` variable, which the page_info
                # handling sets to 'yes' or 'no'.
                repeat: <%repeat%>
                headers:
                    accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
                    accept-language: ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4
                    cache-control: max-age=0
                    upgrade-insecure-requests: 1
                    # Sends back the `mid` cookie captured from the profile page.
                    Cookie: mid=<%mid%>;
                do:
                # Pause between paginated requests.
                - sleep: 5
                # Assume this is the last page unless page_info says otherwise.
                - variable_set:
                    field: repeat
                    value: 'no'
                - find:
                    path: edge_owner_to_timeline_media>page_info
                    do:
                    - find:
                        path: has_next_page
                        do:
                        - parse
                        - if:
                            # Quoted so the pattern stays the string "true"
                            # and is not read as a YAML boolean.
                            match: 'true'
                            do:
                            - variable_set:
                                field: repeat
                                value: 'yes'
                    - find:
                        path: end_cursor
                        do:
                        - parse
                        # URL-encode the cursor for use as <%cursor%> in the
                        # next request's query string.
                        - eval:
                            routine: js
                            body: '(function () {return encodeURIComponent("<%register%>")})();'
                        - variable_set: cursor
                # Total number of posts, stored on the `user` object.
                - find:
                    path: edge_owner_to_timeline_media>count
                    do:
                    - parse
                    - object_field_set:
                        object: user
                        field: media_count
                # Posts from the GraphQL response: same output fields as the
                # first-page nodes, read from the differently named elements
                # of edge_owner_to_timeline_media>edges>node.
                - find:
                    path: edge_owner_to_timeline_media>edges>node
                    do:
                    - object_new: nodes
                    - find:
                        path: id
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: id
                    - find:
                        path: is_video
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: is_video
                    - find:
                        path: video_views
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: video_views
                    # taken_at_timestamp is stored as `date`, reformatted from
                    # '%s' (presumably Unix epoch seconds).
                    - find:
                        path: taken_at_timestamp
                        do:
                        - parse
                        - normalize:
                            routine: date_format
                            args:
                                format_in: '%s'
                                format_out: '%Y-%m-%d %H:%M:%S'
                        - object_field_set:
                            object: nodes
                            field: date
                    - find:
                        path: dimensions>width
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: width
                    - find:
                        path: dimensions>height
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: height
                    # edge_media_preview_like>count -> `likes_count`.
                    - find:
                        path: edge_media_preview_like>count
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: likes_count
                    # edge_media_to_comment>count -> `comments_count`.
                    - find:
                        path: edge_media_to_comment>count
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: comments_count
                    - find:
                        path: comments_disabled
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: comments_disabled
                    # The whole edge_media_to_caption element is parsed into
                    # the `caption` field.
                    - find:
                        path: edge_media_to_caption
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: caption
                    - find:
                        path: thumbnail_src
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: thumbnail
                    # display_url -> `media`.
                    - find:
                        path: display_url
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: media
                    # Attach the finished post object to the `user` object.
                    - object_save:
                        name: nodes
                        to: user
            # Emit the completed `user` object (profile fields plus all saved
            # post objects). Indentation of `name` now matches the other
            # object_save commands in this file.
            - object_save:
                name: user

Revision: 70909
at September 13, 2016 22:46 by martinson


Initial Code
---
# Global settings for the run.
config:
    # Presumably the browser identity used for requests — confirm against the
    # Diggernaut documentation.
    agent: Firefox
# Input fieldset: these values are read below as <%user%> and through
# argument_get (itemstoget, mode).
iterator:
    - type: fieldset
      fields:
      - user: somusername
        # Number of posts to request; the script also checks for `all`.
        itemstoget: 30
        # The script fetches per-post comments and likes only for `extended`.
        mode: simple 
do:
- walk:
    to: https://www.instagram.com/<%user%>/
    do:
    - find:
        path: body
        do:
        # Keep only the JSON assigned to window._sharedData and convert it to
        # XML so the find paths below can address it.
        - parse:
            filter: window\._sharedData\s+\=\s+([^;]+);
        - normalize:
            routine: json2xml
        - to_block
        # Store the CSRF token; it is sent as the x-csrftoken header of the
        # /query/ POST below.
        - find:
            path: config>csrf_token
            do:
            - parse
            - variable_set: token
        - find:
            path: entry_data>profilepage
            do:
            # Store the post base URL in variable `baseurl`; it is prepended
            # to each post code later to build the post URL.
            - register_set: https://www.instagram.com/p
            - variable_set: baseurl
            # Create the top-level `user` object; each find below copies one
            # profile value into the field of the same name.
            - object_new: user
            - find:
                path: user>id
                do:
                - parse
                - object_field_set:
                    object: user
                    field: id
                # Also kept as <%userid%> for the ig_user(...) query.
                - variable_set: userid
            - find:
                path: user>username
                do:
                - parse
                - object_field_set:
                    object: user
                    field: username
            - find:
                path: user>full_name
                do:
                - parse
                - object_field_set:
                    object: user
                    field: full_name
            - find:
                path: user>biography
                do:
                - parse
                - object_field_set:
                    object: user
                    field: biography
            - find:
                path: user>profile_pic_url
                do:
                - parse
                - object_field_set:
                    object: user
                    field: profile_pic_url
            - find:
                path: user>profile_pic_url_hd
                do:
                - parse
                - object_field_set:
                    object: user
                    field: profile_pic_url_hd
            - find:
                path: user>external_url
                do:
                - parse
                - object_field_set:
                    object: user
                    field: external_url
            - find:
                path: user>external_url_linkshimmed
                do:
                - parse
                - object_field_set:
                    object: user
                    field: external_url_linkshimmed
            - find:
                path: user>country_block
                do:
                - parse
                - object_field_set:
                    object: user
                    field: country_block
            # The two counters are stored under shorter field names
            # (`follows`, `followed_by`) than their source paths.
            - find:
                path: user>follows>count
                do:
                - parse
                - object_field_set:
                    object: user
                    field: follows
            - find:
                path: user>followed_by>count
                do:
                - parse
                - object_field_set:
                    object: user
                    field: followed_by
            # Remember the start cursor; the query below asks for media after it.
            - find:
                path: user>media>page_info>start_cursor
                do:
                - parse
                - variable_set: cursor
            # Decide how many posts to request: when the `itemstoget` setting
            # matches `all`, use the profile's total media count; otherwise
            # use the setting's value as it is.
            - argument_get: itemstoget
            - if:
                match: all
                do:
                - find:
                    path: user>media>count
                    do:
                    - parse
                    - variable_set: items_get
                else:
                - variable_set: items_get
            - walk:
                # POST to the /query/ endpoint, authenticated with the CSRF
                # token captured from the profile page.
                to:
                    post: https://www.instagram.com/query/
                    headers:
                        x-csrftoken: <%token%>
                        x-instagram-ajax: 1
                        x-requested-with: XMLHttpRequest
                    data:
                        # Requests <%items_get%> posts after <%cursor%> for the
                        # user. Continuation lines are now indented deeper than
                        # the `q` key, as the YAML spec requires for multi-line
                        # quoted scalars; line folding strips that indentation,
                        # so the value sent is unchanged.
                        q: 'ig_user(<%userid%>) { media.after(<%cursor%>, <%items_get%>) {
                            count,
                            nodes {
                              caption,
                              code,
                              comments {
                                count
                              },
                              comments_disabled,
                              date,
                              dimensions {
                                height,
                                width
                              },
                              display_src,
                              id,
                              is_video,
                              likes {
                                count
                              },
                              owner {
                                id
                              },
                              thumbnail_src,
                              video_views
                            },
                            page_info
                          }
                          }'
                        ref: 'users::show'
                do:
                # Total number of posts, stored on the `user` object.
                - find:
                    path: media>count
                    do:
                    - parse
                    - object_field_set:
                        object: user
                        field: media_count
                - find:
                    path: media>nodes
                    do:
                    # Start a new post object; the finds below fill it from
                    # the current media node.
                    - object_new: nodes
                    - find:
                        path: id
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: id
                    - find:
                        path: is_video
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: is_video
                    - find:
                        path: video_views
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            # Fixed: was misspelled "vide_views", which put the
                            # value under a field name that does not match its
                            # source element.
                            field: video_views
                    # Reformat the date from '%s' (presumably Unix epoch
                    # seconds) to 'YYYY-MM-DD HH:MM:SS'.
                    - find:
                        path: date
                        do:
                        - parse
                        - normalize:
                            routine: date_format
                            args:
                                format_in: '%s'
                                format_out: '%Y-%m-%d %H:%M:%S'
                        - object_field_set:
                            object: nodes
                            field: date
                    - find:
                        path: dimensions>width
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: width
                    - find:
                        path: dimensions>height
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: height
                    - find:
                        path: likes>count
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: likes_count
                    - find:
                        path: comments>count
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: comments_count
                    - find:
                        path: comments_disabled
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: comments_disabled
                    # NOTE(review): the query above requests `caption`, while
                    # this path is `caption_safe` — confirm the response
                    # really contains a caption_safe element.
                    - find:
                        path: caption_safe
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: caption
                    # thumbnail_src -> `thumbnail`, display_src -> `media`.
                    - find:
                        path: thumbnail_src
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: thumbnail
                    - find:
                        path: display_src
                        do:
                        - parse
                        - object_field_set:
                            object: nodes
                            field: media
                    # Build the post URL from its code, and in `extended` mode
                    # open the post page to collect its comments and likes.
                    - find:
                        path: code
                        do:
                        - parse
                        # Prefix the code with <%baseurl%>, joined by "/".
                        - variable_prepend:
                            field: baseurl
                            joinby: "/"
                        - object_field_set:
                            object: nodes
                            field: url
                        - variable_set: node_url
                        # Only `extended` mode performs the extra request per post.
                        - argument_get: mode
                        - if:
                            match: extended
                            do:
                            - variable_get: node_url
                            - walk:
                                to: value
                                do:
                                - find:
                                    path: body
                                    do:
                                    # Same sharedData extraction as on the
                                    # profile page.
                                    - parse:
                                        filter: window\._sharedData\s+\=\s+([^;]+);
                                    - normalize:
                                        routine: json2xml
                                    - to_block
                                    # One `comments` object per comment node,
                                    # saved into the current post object.
                                    - find:
                                        path: entry_data>postpage>media>comments>nodes
                                        do:
                                        - object_new: comments
                                        # '>id' — presumably restricts the match
                                        # to the node's own id rather than the
                                        # nested user>id; confirm.
                                        - find:
                                            path: '>id'
                                            do:
                                            - parse
                                            - object_field_set:
                                                object: comments
                                                field: id
                                        - find:
                                            path: user>id
                                            do:
                                            - parse
                                            - object_field_set:
                                                object: comments
                                                field: user_id
                                        - find:
                                            path: user>username
                                            do:
                                            - parse
                                            - object_field_set:
                                                object: comments
                                                field: username
                                        - find:
                                            path: user>profile_pic_url
                                            do:
                                            - parse
                                            - object_field_set:
                                                object: comments
                                                field: user_profile_pic
                                        - find:
                                            path: text
                                            do:
                                            - parse
                                            - object_field_set:
                                                object: comments
                                                field: text
                                        # Keep only the digits before the
                                        # decimal point, then format as a date.
                                        - find:
                                            path: created_at
                                            do:
                                            - parse:
                                                filter: (\d+)\.\d+
                                            - normalize:
                                                routine: date_format
                                                args:
                                                    format_in: '%s'
                                                    format_out: '%Y-%m-%d %H:%M:%S'
                                            - object_field_set:
                                                object: comments
                                                field: date
                                        - object_save:
                                            name: comments
                                            to: nodes
                                    # One `likes` object per like node, saved
                                    # into the current post object.
                                    - find:
                                        path: entry_data>postpage>media>likes>nodes
                                        do:
                                        - object_new: likes
                                        - find:
                                            path: user>id
                                            do:
                                            - parse
                                            - object_field_set:
                                                object: likes
                                                field: user_id
                                        - find:
                                            path: user>username
                                            do:
                                            - parse
                                            - object_field_set:
                                                object: likes
                                                field: username
                                        - find:
                                            path: user>profile_pic_url
                                            do:
                                            - parse
                                            - object_field_set:
                                                object: likes
                                                field: user_profile_pic
                                        - object_save:
                                            name: likes
                                            to: nodes
                    # Attach the finished post object to the `user` object.
                    - object_save:
                        name: nodes
                        to: user 
            # Emit the completed `user` object (profile fields plus all saved
            # post objects). Indentation of `name` now matches the other
            # object_save commands in this file.
            - object_save:
                name: user

Initial URL
https://www.diggernaut.com

Initial Description
Hey guys,

I am sharing my Diggernaut scripts for web scraping; I hope you find them useful.

This script scrapes Instagram user accounts without logging in to Instagram, so there is no risk. It collects all the information about a user (full name, username, id, avatar, number of follows and followers, number of posts) and information about the user's posts (URL, image, number of likes, the people who liked it, comments, caption). You will probably not want every post, but only, say, the 10 (or 30) most recent ones; you can adjust this in the settings. You can also set the script's mode to simple or extended. In simple mode it does not retrieve the list of people who liked a post or the post's comments, so it works faster and uses less bandwidth.

If you look into script (lines 5-9):

    - type: fieldset
      fields:
      - user: somusername

you can see the settings you may adjust: instead of "someusername", set the username of the Instagram account you want to scrape. You can also scrape multiple users; in that case, add an additional `user` entry for each account, as shown below:

    - type: fieldset
      fields:
      - user: somusername1
      - user: somusername2

Initial Title
Public Instagram User Scraper

Initial Tags
data, web

Initial Language
Other