"""Generate ``WebGrab++.config.xml`` for a fixed list of TV channels.

The script:

1. downloads the channel listing page from lineup.tv.br,
2. copies the template ``config_partial.xml`` into ``WebGrab++.config.xml``,
3. for every channel in ``channels_to_grab`` looks up its ``Guia=<digits>``
   code on the downloaded page and appends one line per channel to the
   config file.

Any failure (missing template, network error, HTTP error status) raises and
aborts the script; the success message is only printed when everything ran.
"""
import codecs
import os
import re
import subprocess
import sys
import time

import requests

# Channels that will be scraped from the website.
# Format:
#   "Name of the channel as it appears on the website":
#       "Number the channel will be assigned on the generated XML file"
channels_to_grab = {
    "Cultura": "3.1",
    "Univesp TV": "3.2",
    "TV Educação": "3.3",
    "Record News": "6.1",
    "RecordTV Litoral-Vale": "8.1",
    "Rede 21": "10.1",
    "TVB Santos": "12.1",
    "RedeTV!": "14.1",
    "Rede Vida": "15.1",
    "TV Tribuna Santos": "18.1",
    "Gazeta": "32.1",
    "ISTV": "36.1",
    "RBI TV": "38.1",
    "TV Unisantos": "40.1",
    "VTV": "46.1",
    "Santa Cecília TV": "52.1",
    "IDTV": "56.1",
}

# Without a timeout requests.get() may block forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

user_agent = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36"
)

# Fetch the page BEFORE touching the output file: opening it with "w+"
# truncates it, so a network failure must not destroy an existing config.
website = requests.get(
    "http://www.lineup.tv.br/gdc.php",
    headers={"User-Agent": user_agent},
    timeout=REQUEST_TIMEOUT_SECONDS,
)
# Abort on 4xx/5xx instead of scanning an error page for channel codes.
website.raise_for_status()

# Context managers guarantee both files are closed even if a write fails.
# codecs.open() opens the underlying file in binary mode, so the "\r\n" and
# "\n" sequences written below reach the file unchanged.
with open("config_partial.xml", "r") as model_file, \
        codecs.open("WebGrab++.config.xml", "w+", "utf-8") as config_file:
    # Start the config with a verbatim copy of the template.
    for line in model_file:
        config_file.write(line)

    config_file.write("\r\n")

    for channel_name in channels_to_grab:
        print('canal: ', channel_name)

        # re.search scans the whole page text. Requiring digits right after
        # "Guia=" and the exact channel name as the link text selects the
        # link that belongs to this channel. The name is escaped so that it
        # is always matched literally, never interpreted as a pattern.
        stripped = re.search(
            r'Guia=(\d+)">' + re.escape(channel_name) + r'</a>',
            website.text,
        )
        if stripped is None:
            print(
                "warning: no 'Guia=' code found on the page for channel:",
                channel_name,
                file=sys.stderr,
            )

        # NOTE(review): the matched code (stripped.group(1)) and the number
        # from channels_to_grab are not part of the line written here; only
        # the channel name is. Looks like the channel element markup may be
        # missing from this string -- confirm against the intended format.
        config_file.write(channel_name + "\r\n")

    config_file.write("\n")

print(
    "WebGrab config file generated with the current channel codes for "
    "the channels requested. There's no error handling, so it must mean "
    "that the program finished sucessfully."
)