Подпишитесь на рассылку о самых интересных материалах в мире веб-разработки :)

Создание веб-парсера для Instagram


(Gtufc) #1

Добрый день! Есть ссылка «Города Италии». Необходимо для каждого города Италии вывести в отдельный файл список всех мест в виде ссылок вида
https://www.instagram.com/explore/locations/213158406/. Какие идеи посоветуете, какие ссылки на ресурсы?


(Сергей) #2

Веб-парсер? - Nokogiri!


(Gtufc) #3

Дело в том, что я получаю не HTML-структуру, а коллекцию window._sharedData, и я ещё не разобрался, как получить все id из списка location_list не в виде “212911294”, а в виде 212911294. Ещё не понимаю, как реализовать обход страниц, которые динамически подгружаются после нажатия кнопки «Ещё».
Нужно вытащить id из location_list со всех страниц.

<script type="text/javascript">window._sharedData = {"activity_counts": null, "config": {"csrf_token": "KWDG5RrCyJDMG1heSXsb100pU3aSNfb1", "viewer": null}, "country_code": "BY", "language_code": "ru", "entry_data": {"LocationsDirectoryPage": [{"city_directory_page": true, "country_info": {"id": "US", "name": "United States", "slug": "united-states"}, "city_info": {"id": "c2490299", "name": "New York", "slug": "new-york-united-states"}, "location_list": [{"id": "212988663", "name": "New York, New York", "slug": "new-york-new-york"}, {"id": "49695104", "name": "Brooklyn Bridge", "slug": "brooklyn-bridge"}, {"id": "1553715", "name": "Barclays Center", "slug": "barclays-center"}, {"id": "34648", "name": "National September 11 Memorial \u0026 Museum", "slug": "national-september-11-memorial-museum"}, {"id": "1495", "name": "Washington Square Park", "slug": "washington-square-park"}, {"id": "504854015", "name": "SoHo, Manhattan", "slug": "soho-manhattan"}, {"id": "14036", "name": "Webster Hall", "slug": "webster-hall"}, {"id": "399440619", "name": "One World Trade Center", "slug": "one-world-trade-center"}, {"id": "23023", "name": "DUMBO, Brooklyn", "slug": "dumbo-brooklyn"}, {"id": "262783", "name": "Irving Plaza", "slug": "irving-plaza"}, {"id": "212911294", "name": "Statue of Liberty National Monument", "slug": "statue-of-liberty-national-monument"}, {"id": "243068390", "name": "East Village, Manhattan", "slug": "east-village-manhattan"}, {"id": "3066", "name": "Madison Square Park", "slug": "madison-square-park"}, {"id": "234297315", "name": "Lower East Side", "slug": "lower-east-side"}, {"id": "1458018", "name": "Union Square Park", "slug": "union-square-park"}, {"id": "5927494", "name": "Dominique Ansel Bakery", "slug": "dominique-ansel-bakery"}, {"id": "312595389", "name": "Skylight Clarkson Sq", "slug": "skylight-clarkson-sq"}, {"id": "20693", "name": "Midtown Manhattan", "slug": "midtown-manhattan"}, {"id": "293007743", "name": "World Trade Center 9/11 Memorial, 
NYC", "slug": "world-trade-center-911-memorial-nyc"}, {"id": "1021752827", "name": "Love", "slug": "love"}, {"id": "505179821", "name": "West Village", "slug": "west-village"}, {"id": "21543", "name": "SoHo", "slug": "soho"}, {"id": "213443263", "name": "Chinatown, Manhattan", "slug": "chinatown-manhattan"}, {"id": "3001685", "name": "Williamsburg Bridge", "slug": "williamsburg-bridge"}, {"id": "1859885", "name": "Brooklyn Bridge Park", "slug": "brooklyn-bridge-park"}, {"id": "523486691", "name": "Greenwich Village", "slug": "greenwich-village"}, {"id": "1032844147", "name": "9/11 Memorial", "slug": "911-memorial"}, {"id": "302575463", "name": "World Trade Center", "slug": "world-trade-center"}, {"id": "3002301", "name": "The Bowery Ballroom", "slug": "the-bowery-ballroom"}, {"id": "6639340", "name": "Manhattan Bridge", "slug": "manhattan-bridge"}, {"id": "451362495", "name": "ONE WORLD OBSERVATORY", "slug": "one-world-observatory"}, {"id": "215275862", "name": "Wall Street", "slug": "wall-street"}, {"id": "2962165", "name": "Smorgasburg", "slug": "smorgasburg"}, {"id": "19001", "name": "Rockwood Music Hall", "slug": "rockwood-music-hall"}, {"id": "531233", "name": "Beauty \u0026 Essex", "slug": "beauty-essex"}, {"id": "3001670", "name": "Brooklyn Bridge", "slug": "brooklyn-bridge"}, {"id": "1519", "name": "New Museum", "slug": "new-museum"}, {"id": "374339268", "name": "Little Italy in NYC", "slug": "little-italy-in-nyc"}, {"id": "1017684341", "name": "Mr. 
Purple", "slug": "mr-purple"}, {"id": "213123705", "name": "Battery Park", "slug": "battery-park"}, {"id": "225773009", "name": "The Brooklyn Bridge New York City", "slug": "the-brooklyn-bridge-new-york-city"}, {"id": "25449", "name": "Flatiron District, NYC", "slug": "flatiron-district-nyc"}, {"id": "212995471", "name": "Lower Manhattan", "slug": "lower-manhattan"}, {"id": "215391845", "name": "9/11 National Memorial, New York City", "slug": "911-national-memorial-new-york-city"}, {"id": "213321201", "name": "Eataly NYC Flatiron", "slug": "eataly-nyc-flatiron"}, {"id": "24408851", "name": "The Royal Palms Shuffleboard Club", "slug": "the-royal-palms-shuffleboard-club"}, {"id": "1721707", "name": "Katzs Delicatessen", "slug": "katzs-delicatessen"}, {"id": "265952594", "name": "China Town, Manhattan, New York City", "slug": "china-town-manhattan-new-york-city"}], "next_page": 2, "logging_page_id": "locationPage_c2490299"}]}, "gatekeepers": {"bn": true, "ld": true, "nr": true, "pl": true}, "qe": {"dash_for_vod": {"g": "", "p": {}}, "ebd": {"g": "", "p": {}}, "bc3l": {"g": "", "p": {}}, "aysf": {"g": "", "p": {}}, "notif": {"g": "", "p": {}}, "create_upsell": {"g": "", "p": {}}, "feed": {"g": "", "p": {}}, "follow_button": {"g": "", "p": {}}, "login_via_signup_page": {"g": "", "p": {}}, "loggedout": {"g": "", "p": {}}, "stories": {"g": "", "p": {}}, "su_universe": {"g": "", "p": {}}, "us": {"g": "", "p": {}}, "us_li": {"g": "", "p": {}}, "nav": {"g": "", "p": {}}, "nav_lo": {"g": "", "p": {}}, "deact": {"g": "", "p": {}}, "sidecar": {"g": "", "p": {}}, "video": {"g": "", "p": {}}, "filters": {"g": "", "p": {}}, "typeahead": {"g": "", "p": {}}, "appsell": {"g": "", "p": {}}, "save": {"g": "", "p": {}}, "stale": {"g": "", "p": {}}, "reg": {"g": "", "p": {}}, "reg_vp": {"g": "test_group_2", "p": {"hide_value_prop": "true"}}, "nux": {"g": "", "p": {}}, "prof_pic_upsell": {"g": "", "p": {}}, "prof_pic_creation": {"g": "", "p": {}}, "onetaplogin": {"g": "", "p": {}}, 
"feed_vp": {"g": "", "p": {}}, "push_notifications": {"g": "", "p": {}}, "login_poe": {"g": "", "p": {}}, "feed_perf": {"g": "", "p": {}}, "prefetch": {"g": "", "p": {}}, "report_haf": {"g": "", "p": {}}, "report_category_reorder": {"g": "", "p": {}}, "a2hs": {"g": "", "p": {}}, "bg_sync": {"g": "", "p": {}}, "disc_ppl": {"g": "", "p": {}}}, "hostname": "www.instagram.com", "display_properties_server_guess": {"pixel_ratio": 1.0, "viewport_width": 1366, "viewport_height": 662, "orientation": "landscape-primary"}, "environment_switcher_visible_server_guess": true, "platform": "web", "nonce": "TQP4gnBRo/Kc74mPZ31Ovg==", "zero_data": {}, "rollout_hash": "25b7219f6781", "probably_has_app": false, "show_app_install": true};</script>

Пока получается получить только один id:

require 'json'
require 'open-uri'

# Fetches an Instagram locations-directory page and returns the id of the
# first entry in its +location_list+.
#
# The page carries its data not as HTML markup but as a JSON object assigned
# to +window._sharedData+ inside a <script> tag, so that object is cut out of
# the page source and parsed.
#
# @param location [String] path segment appended to the directory URL,
#   e.g. "new-york-united-states/"
# @return [String, nil] the first location id exactly as it appears in the
#   JSON (a string of digits), or nil when the page cannot be fetched, the
#   payload is missing or malformed, or the list is empty
def scrape_instagram(location)
  # URI.open instead of Kernel#open: since Ruby 3.0 Kernel#open no longer
  # fetches URLs. The block form closes the stream once it has been read.
  instagram_source = URI.open("https://www.instagram.com/explore/locations/c2490299/#{location}", &:read)

  # Capture the whole object up to the "};</script>" that terminates the
  # assignment. Cutting at the first ";" would truncate the JSON whenever a
  # string value (e.g. a place name) contains a semicolon.
  payload = instagram_source[%r{window\._sharedData\s*=\s*(\{.*?\});\s*</script>}m, 1]
  return nil unless payload

  JSON.parse(payload).dig('entry_data', 'LocationsDirectoryPage', 0, 'location_list', 0, 'id')
rescue StandardError
  # Best-effort lookup: network, HTTP and JSON errors all mean "no id".
  # StandardError (not Exception) so Interrupt and SystemExit still propagate.
  nil
end

# Example run: print the first location id returned for the given directory
# path (prints nil when the lookup fails).
p scrape_instagram('new-york-united-states/')