# swft_twitter_crawler.py
import os
import requests
import tweepy
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from getpass import getpass
from concurrent.futures import ThreadPoolExecutor, as_completed

# Enter your Twitter API credentials (read without echoing them to the terminal)
consumer_key = getpass("Enter your consumer key: ")
consumer_secret = getpass("Enter your consumer secret: ")

def get_links(url):
    """Return links found on the page: same-domain pages plus twitter.com links."""
    domain = urlparse(url).netloc
    reqs = requests.get(url, timeout=10)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    urls = []
    for link in soup.find_all('a'):
        link_url = link.get('href')
        if link_url:
            # Resolve relative hrefs against the page URL so they stay fetchable.
            absolute_url = urljoin(url, link_url)
            link_domain = urlparse(absolute_url).netloc
            # Keep internal links for crawling and twitter.com links for harvesting.
            if link_domain == domain or "twitter.com" in link_domain:
                urls.append(absolute_url)
    return urls

def get_twitter_api():
    # App-only auth; wait_on_rate_limit_notify was removed in Tweepy 4.x,
    # so only wait_on_rate_limit is passed here.
    auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True)
    return api

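# Note: the following is an assumption, not part of the original script. With
# Tweepy 4.x, a pre-generated bearer token could replace the key/secret pair:
#
#     api = tweepy.API(tweepy.OAuth2BearerHandler(bearer_token),
#                      wait_on_rate_limit=True)
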
def get_twitter_username_from_url(url):
    domain = urlparse(url).netloc
    if "twitter.com" in domain:
        # The first path segment of a profile URL is the username;
        # guard against bare https://twitter.com/ links with an empty path.
        path_parts = urlparse(url).path.split('/')
        if len(path_parts) > 1 and path_parts[1]:
            return path_parts[1]
    return None

def get_tweets(twitter_username, api, tweet_count=10):
    try:
        tweets = api.user_timeline(screen_name=twitter_username, count=tweet_count, tweet_mode='extended')
        return [tweet.full_text for tweet in tweets]
    except tweepy.TweepyException as e:  # TweepError and e.reason were removed in Tweepy 4.x
        print(f"Error retrieving tweets for {twitter_username}: {e}")
        return []

def save_tweets_to_file(username, tweets):
    folder = "twitter_data"
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, f"{username}.txt"), 'w', encoding='utf-8') as f:
        for tweet in tweets:
            f.write(f"{tweet}\n\n")

def crawl_and_find_twitter_links(start_url):
    """Sequential variant of crawl_twitter_data: collect twitter.com links only."""
    visited_urls = set()
    urls_to_visit = [start_url]
    twitter_links = []
    while urls_to_visit:
        current_url = urls_to_visit.pop()
        if current_url not in visited_urls:
            print(f"Visiting {current_url}")
            visited_urls.add(current_url)
            try:
                new_links = get_links(current_url)
                twitter_links.extend(link for link in new_links if "twitter.com" in urlparse(link).netloc)
                # Only queue pages on the site itself; twitter.com links are not crawled.
                urls_to_visit.extend(
                    link for link in new_links
                    if link not in visited_urls and "twitter.com" not in urlparse(link).netloc
                )
            except Exception as e:
                print(f"Error processing {current_url}: {e}")
    return twitter_links

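# Hypothetical usage of the sequential variant above, e.g. to only list the
# discovered profile links without fetching any tweets:
#
#     links = crawl_and_find_twitter_links("https://www.swft.pro/")
#     for link in sorted(set(links)):
#         print(link)
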
def process_link(current_url, twitter_api):
    print(f"Visiting {current_url}")
    try:
        new_links = get_links(current_url)
        twitter_links = [link for link in new_links if "twitter.com" in urlparse(link).netloc]
        for twitter_link in set(twitter_links):
            twitter_username = get_twitter_username_from_url(twitter_link)
            if twitter_username:
                tweets = get_tweets(twitter_username, twitter_api)
                if tweets:
                    save_tweets_to_file(twitter_username, tweets)
                    print(f"Saved tweets for {twitter_username}")
        # Return only site links so the crawler does not wander into twitter.com.
        return [link for link in new_links if "twitter.com" not in urlparse(link).netloc]
    except Exception as e:
        print(f"Error processing {current_url}: {e}")
        return []

def crawl_twitter_data(start_url, max_threads=10):
    twitter_api = get_twitter_api()
    visited_urls = set()
    urls_to_visit = [start_url]
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        # Breadth-first crawl: each pass through the loop processes one batch
        # of pages in parallel, then gathers the next batch from the results.
        while urls_to_visit:
            futures = {executor.submit(process_link, url, twitter_api): url for url in urls_to_visit}
            visited_urls |= set(urls_to_visit)  # Mark the whole batch as visited.
            next_urls = set()  # Deduplicate so no URL is submitted twice per batch.
            for future in as_completed(futures):
                new_links = future.result()
                next_urls.update(link for link in new_links if link not in visited_urls)
            urls_to_visit = list(next_urls)

if __name__ == "__main__":
    start_url = 'https://www.swft.pro/'
    crawl_twitter_data(start_url)
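
# A minimal sketch (assumption, not in the original): the start URL could be
# taken from the command line instead of being hard-coded, e.g.:
#
#     import sys
#     start_url = sys.argv[1] if len(sys.argv) > 1 else 'https://www.swft.pro/'
#     crawl_twitter_data(start_url)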