from bs4 import BeautifulSoup
import html5lib
import requests
import time
# Input: one Tistory article URL per line.
tw_link = open("TW_Links.txt", "r", encoding='utf-8')
# Outputs: scraped image URLs, split by hosting CDN (daumcdn vs. kakao blog).
# Explicit encoding on the output files too, so behavior does not depend on
# the platform's default locale encoding.
im_link = open("DCDN_Links.txt", "w+", encoding='utf-8')
kak_link = open("KCDN_Links.txt", "w+", encoding='utf-8')
def get_images(urlset):
    """Fetch each article URL in *urlset* and write the full-resolution
    image URLs found in its body to the open CDN link files.

    URLs containing "blog" are additionally recorded in kak_link; every
    image URL is recorded in im_link. Prints each URL to stdout as well.
    """
    # One session for all requests: reuses the TCP connection instead of
    # opening a fresh one per page (the original re-created the Session
    # inside the loop, defeating its purpose).
    rs = requests.Session()
    for page_url in urlset:
        resp = rs.get(page_url)
        soup = BeautifulSoup(resp.text, "html5lib")
        content = soup.find("div", {"class": "tt_article_useless_p_margin"})
        if content is None:
            # Article body not found (e.g. the site redirected to its 404
            # page) -- skip instead of crashing on content.findAll below.
            continue
        for img in content.findAll('img'):
            # "?original" asks the CDN for the full-size image.
            img_url = img['src'] + "?original"
            if "blog" in img_url:
                print(img_url, file=kak_link)
                print(img_url)
            print(img_url, file=im_link)
            print(img_url)
        # Be polite to the server between page fetches.
        time.sleep(2)
def get_links():
    """Read article URLs from the open TW_Links.txt handle and scrape each.

    Strips the trailing newline from every line -- the target site responds
    with a 404 redirect when the request URL contains "\n".
    """
    linklist = []
    for line in tw_link:
        linklist.append(line.replace("\n", ""))
    get_images(linklist)
# Entry point: read the link list, scrape every article, write CDN URLs out.
get_links()
# For those waiting for a solution, it was pretty simple. I was doubtful of the
# requests module, so I intercepted the program's traffic with a proxy and,
# voila, it turns out the requests module included the EOL character in the
# request as well. While that might have worked with most sites, this
# particular site redirected to the 404 page, so simply removing "\n" from the
# lines read did the trick.