lundi 2 mai 2016

Python - Web scraping Issue

I'm having problems with my web scraper when trying to get the channel title. I'm not sure how to fix it; however, after some testing with the channel function, it seems that video links are accepted by it as well, when only channel links should work with the YoutubeChannel function.

Any ideas on how to fix it?

#Required Modules
import re
import sys
import urllib

#Defining the YouTube Video function
def YoutubeVideo():
    """Prompt for a YouTube video URL and print its title, views and upload text.

    Reads the link from standard input, downloads the page, and extracts
    three pieces of information with regular expressions: the contents of
    the ``watch-view-count`` div, the ``<title>`` element and the
    ``watch-time-text`` strong element. Each extraction yields a list of
    matches (possibly empty), and those lists are printed as-is.

    Returns nothing. Network errors from ``urllib.urlopen`` are not caught
    here and propagate to the caller.
    """
    # NOTE(review): under Python 2, input() evaluates the typed text as an
    # expression (which is why the prompt asks for quotation marks). That is
    # unsafe for untrusted input; consider raw_input() plus an updated prompt.
    videoLink = input('\nWhat is your video link? (In quotations, with http included)\n')

    # Download the page, making sure the connection is released even if
    # read() fails part-way through.
    htmlfile = urllib.urlopen(videoLink)
    try:
        htmltext = htmlfile.read()
    finally:
        htmlfile.close()

    # Raw strings keep the patterns free of backslash-escaped quotes.
    viewPattern = re.compile(r'<div class="watch-view-count">(.+?)</div>')
    titlePattern = re.compile(r'<title>(.+?)</title>')
    uploadPattern = re.compile(r'<strong class="watch-time-text">(.+?)</strong>')

    # findall() returns a list of every captured group found in the page.
    viewCount = viewPattern.findall(htmltext)
    videoTitle = titlePattern.findall(htmltext)
    videoUpload = uploadPattern.findall(htmltext)

    # Echo the link back (primarily for testing), then report the findings.
    print ("\n%s" % (videoLink))
    print ("\nThe title of your video is %s and has %s views.\nIt was %s." % (videoTitle, viewCount, videoUpload))


#Defining the YouTube Channel function
def YoutubeChannel():
    """Prompt for a YouTube channel URL and print the page's ``<title>`` text.

    Reads the link from standard input, downloads the page and prints the
    list of ``<title>`` matches found in the HTML (possibly empty).

    NOTE(review): the ``<title>`` pattern matches any HTML page, and nothing
    in this function checks what kind of URL was entered, so a video link is
    accepted here just as readily as a channel link.

    Returns nothing. Network errors from ``urllib.urlopen`` are not caught
    here and propagate to the caller.
    """
    # NOTE(review): under Python 2, input() evaluates the typed text as an
    # expression (which is why the prompt asks for quotation marks). That is
    # unsafe for untrusted input; consider raw_input() plus an updated prompt.
    channelLink = input('\nWhat is your channel link? (In quotations, with http included)\n')

    # Download the page, making sure the connection is released even if
    # read() fails part-way through.
    htmlfile = urllib.urlopen(channelLink)
    try:
        htmltext = htmlfile.read()
    finally:
        htmlfile.close()

    # Keep the pattern and the result under separate names; findall()
    # returns a list of every captured title.
    titlePattern = re.compile(r"<title>(.+?)</title>")
    channelTitle = titlePattern.findall(htmltext)

    print (channelTitle)



# Interactive menu: keep prompting until the user quits.
# NOTE: an empty response leaves ``ans`` falsy, which also ends the loop.
ans = True
while ans:
    print ("\n[1] Get information regarding a YouTube video.")
    print ("\n[2] Get information regarding a YouTube channel.")
    print ("\n[Q] Quit the application.")

    ans = raw_input("\nWhat would you like to do now? ")
    # The menu advertises an upper-case "Q", so compare case-insensitively.
    choice = ans.lower()
    if choice == "1":
        YoutubeVideo()
    elif choice == "2":
        YoutubeChannel()
    elif choice == "q":
        sys.exit(0)
    elif choice != "":
        print ("Not a valid choice, try again.")




Aucun commentaire:

Enregistrer un commentaire