Beginner at Python here. This question is long, but it's simple, I promise.
I'm scraping the webpages of a list of URLs for all kinds of data and making spreadsheets to search for trends. The one thing I cannot figure out how to detect is the presence of a Facebook pixel. The sites that I'm pulling data from are all Shopify sites, so the pixel exists in the source code under a script like this:
<script class="analytics" type="text/javascript">
// Bootstraps the "trekkie" analytics queue: installs stub methods that record
// calls, asynchronously loads the real library, and passes it a config object.
// That config object is where the "Facebook Pixel" entry appears.
(function () {
// Stand-in for document.write: appends the content to <body> through jQuery
// (window.jQuery, or window.Checkout.$ as a fallback). Does nothing when
// neither is available.
var customDocumentWrite = function(content) {
var jquery = null;
if (window.jQuery) {
jquery = window.jQuery;
} else if (window.Checkout && window.Checkout.$) {
jquery = window.Checkout.$;
}
if (jquery) {
jquery('body').append(content);
}
};
// Reuse an existing window.trekkie or start a new empty array; the same array
// is also exposed as window.ShopifyAnalytics.lib.
// NOTE(review): assumes window.ShopifyAnalytics is already defined — confirm.
var trekkie = window.ShopifyAnalytics.lib = window.trekkie = window.trekkie || [];
// Bail out if `integrations` is already set — presumably the real library has
// been loaded already; verify.
if (trekkie.integrations) {
return;
}
// Names of the stub methods installed on the queue below.
trekkie.methods = [
'identify',
'page',
'ready',
'track',
'trackForm',
'trackLink'
];
// Builds a stub for `method` that pushes [method, ...arguments] onto the
// trekkie array and returns trekkie.
trekkie.factory = function(method) {
return function() {
var args = Array.prototype.slice.call(arguments);
args.unshift(method);
trekkie.push(args);
return trekkie;
};
};
// Install one stub per method name.
for (var i = 0; i < trekkie.methods.length; i++) {
var key = trekkie.methods[i];
trekkie[key] = trekkie.factory(key);
}
// Stores the config on trekkie.config and injects an async <script> for the
// library ahead of the first <script> element in the document.
trekkie.load = function(config) {
trekkie.config = config;
var script = document.createElement('script');
script.type = 'text/javascript';
// On load failure, request a tracking URL via an Image object.
script.onerror = function(e) {
(new Image()).src = '//v.shopify.com/internal_errors/track?error=trekkie_load';
};
script.async = true;
script.src = 'http://ift.tt/2wzmuPS';
var first = document.getElementsByTagName('script')[0];
first.parentNode.insertBefore(script, first);
};
// Config passed to the loader. In this sample it includes a "Facebook Pixel"
// key whose value holds the pixelIds list.
trekkie.load(
{"Trekkie":{"appName":"storefront","development":false,"defaultAttributes":{"shopId":21353283,"isMerchantRequest":null,"themeId":177557972,"themeCityHash":9027251861051819334}},"Performance":{"navigationTimingApiMeasurementsEnabled":true,"navigationTimingApiMeasurementsSampleRate":0.1},"Facebook Pixel":{"pixelIds":["1930018160600140"],"agent":"plshopify1.2"},"Session Attribution":{}}
);
// Guard so the ready callback body runs at most once.
var loaded = false;
trekkie.ready(function() {
if (loaded) return;
loaded = true;
window.ShopifyAnalytics.lib = window.trekkie;
// Swap document.write for customDocumentWrite while merchantGoogleAnalytics
// runs, then restore the original.
var originalDocumentWrite = document.write;
document.write = customDocumentWrite;
// Any error thrown by merchantGoogleAnalytics is deliberately swallowed.
try { window.ShopifyAnalytics.merchantGoogleAnalytics.call(this); } catch(error) {};
document.write = originalDocumentWrite;
// Record a page call with pageType "home".
window.ShopifyAnalytics.lib.page(
null,
{"pageType":"home"}
);
});
// Load the shop events listener script asynchronously by appending it to <head>.
var eventsListenerScript = document.createElement('script');
eventsListenerScript.async = true;
eventsListenerScript.src = "//cdn.shopify.com/s/assets/shop_events_listener-4c5801cae3452eff0ededa0ac07d432c1240b78b7e11282cceb3c3213951104b.js";
document.getElementsByTagName('head')[0].appendChild(eventsListenerScript);
})();
</script>
The nice thing about scraping Shopify sites is that they are, for the most part, uniform. If a site is using a FB pixel it will appear in this segment of the code pasted above:
// Excerpt of the loader call: the "Facebook Pixel" key in this config object
// is the text whose presence indicates a pixel.
trekkie.load(
{"Trekkie":{"appName":"storefront","development":false,"defaultAttributes":{"shopId":21353283,"isMerchantRequest":null,"themeId":177557972,"themeCityHash":9027251861051819334}},"Performance":{"navigationTimingApiMeasurementsEnabled":true,"navigationTimingApiMeasurementsSampleRate":0.1},"Facebook Pixel":{"pixelIds":["1930018160600140"],"agent":"plshopify1.2"},"Session Attribution":{}}
);
So, I need to make it so that "Yes" appears in my spreadsheet column if the source code of the page contains the string "Facebook Pixel". My code basically looks like this:
import csv
from urllib.request import urlopen as uReq

from bs4 import BeautifulSoup as soup
# Page to scrape. urlopen() requires an explicit scheme: a bare host name such
# as 'www.example.com' raises "ValueError: unknown url type".
my_url = 'http://www.example.com'

# Output CSV path and its header line (kept byte-for-byte, including the
# spaces after the commas, so existing consumers see the same header).
filename = "web_scrape.csv"
headers = "brand, product_name, shipping\n"


def iter_rows(page_html):
    """Yield ``(brand, product_name, shipping)`` for each item on the page.

    Parses ``page_html`` with BeautifulSoup and visits every
    ``div.item-container`` element, printing the extracted fields as it goes.
    A page with no matching containers simply yields nothing.

    Raises:
        AttributeError, IndexError, KeyError: if a container lacks the
            expected nested tags, class names, or ``title`` attribute.
    """
    page_soup = soup(page_html, "html.parser")
    for container in page_soup.findAll("div", {"class": "item-container"}):
        brand = container.div.div.a.img["title"]
        product_name = container.findAll("a", {"class": "item-title"})[0].text
        shipping = container.findAll("li", {"class": "price-ship"})[0].text.strip()

        print("brand: " + brand)
        print("product_name: " + product_name)
        print("shipping: " + shipping)

        yield brand, product_name, shipping


def write_rows(out, rows):
    """Write the header line followed by one CSV record per row to ``out``.

    Args:
        out: a writable text file object.
        rows: iterable of ``(brand, product_name, shipping)`` string tuples.

    Fields are written with ``csv.writer`` so that a comma or quote in any
    column is quoted instead of corrupting the record.
    """
    out.write(headers)
    writer = csv.writer(out, lineterminator="\n")
    for brand, product_name, shipping in rows:
        # The comma-to-pipe substitution in product_name is retained so the
        # output for that column matches what earlier runs produced.
        writer.writerow([brand, product_name.replace(",", "|"), shipping])


def main():
    """Download ``my_url``, extract the item rows and save them to ``filename``."""
    # Context managers close the response and the file even if a step fails.
    with uReq(my_url) as uClient:
        page_html = uClient.read()

    # Parse everything before opening the output so a scraping error does not
    # leave a truncated file behind.
    rows = list(iter_rows(page_html))

    # newline="" is what the csv module expects for files it writes to.
    with open(filename, "w", newline="", encoding="utf-8") as f:
        write_rows(f, rows)


if __name__ == "__main__":
    main()
Aucun commentaire:
Enregistrer un commentaire