Skip to content

Commit 5151a50

Browse files
added insights
1 parent 812a787 commit 5151a50

File tree

3 files changed

+135
-0
lines changed

3 files changed

+135
-0
lines changed

README.md

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,59 @@
11
## Codeforces Problemset Scraper
22
This is a Scrapy-based web scraper specifically designed for extracting detailed information about Codeforces problems. The Codeforces API lacks certain details such as URLs, number of people who have solved the problems, and problem ratings. This scraper solves that problem by extracting data for better problem analysis.
33

4+
5+
6+
### Below are some insights from the data
7+
The following topics (tags) are present in the problemset, along with the number of problems for each.
8+
```
9+
{
10+
"2-sat": 34,
11+
"bitmasks": 559,
12+
"data structures": 1693,
13+
"dp": 2074,
14+
"graphs": 1045,
15+
"matrices": 119,
16+
"two pointers": 528,
17+
"strings": 714,
18+
"brute force": 1638,
19+
"constructive algorithms": 1706,
20+
"greedy": 2782,
21+
"implementation": 2681,
22+
"math": 2817,
23+
"number theory": 722,
24+
"binary search": 1030,
25+
"sortings": 1054,
26+
"combinatorics": 659,
27+
"games": 220,
28+
"hashing": 206,
29+
"interactive": 222,
30+
"dfs and similar": 914,
31+
"trees": 803,
32+
"dsu": 346,
33+
"divide and conquer": 283,
34+
"fft": 92,
35+
"geometry": 387,
36+
"string suffix structures": 90,
37+
"probabilities": 231,
38+
"meet-in-the-middle": 49,
39+
"ternary search": 53,
40+
"shortest paths": 262,
41+
"flows": 143,
42+
"*special problem": 437,
43+
"graph matchings": 89,
44+
"schedules": 10,
45+
"expression parsing": 36,
46+
"chinese remainder theorem": 16
47+
}
48+
49+
Total problems:- 9678
50+
Total types of problems:- 26744
51+
```
52+
Use ```findproblems.py``` to find problems matching your rating and topics.
53+
The ```main.py``` file is for analysis of the problemset.
54+
55+
56+
457
### Implemented Features
558
- Random headers for requests
659
- Storage in MySQL database
@@ -21,3 +74,7 @@ This is a Scrapy-based web scraper specifically designed for extracting detailed
2174
2275
> [!WARNING]
2376
> - Be cautious when using this scraper as Codeforces may block your IP address. Consider using a rotating proxy for each request by adding the line `meta={"proxy":"protocol://yourusername:yourpassword@domainname:port/"}` in the `spiders/cf.py` file, specifically on line 27.
77+
78+
79+
> [!TIP]
80+
>

findproblems.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import main

# Load the scraped problem list once (None when loading fails).
problem_set = main.process_json_file()

# Modify these lists to select the problems you are looking for.
find_on_tags = ["bitmasks"]
find_on_rating = ["800"]

problems_found = 0
problems = []
# Guard: process_json_file() returns None on a read/parse failure;
# iterating None would raise a TypeError.
if problem_set:
    for problem in problem_set:
        # Match a problem at most once, even if several of the requested
        # tags apply to it (the old tag-by-tag loop counted duplicates).
        if any(tag in problem["tags"] for tag in find_on_tags):
            # Ratings are stored as strings in the scraped data.
            if problem["problem_rating"] in find_on_rating:
                problems.append(problem["url"])
                print(problem["url"])
                problems_found += 1

print(f"Total problems found {problems_found}")

main.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import json
2+
# /////////////////////////////////
3+
# This is name of file that will be processed
4+
file_path = "output.json"
5+
# /////////////////////////////////
6+
7+
# make list out of output.json file
8+
# make list out of output.json file
def process_json_file(path=None):
    """Load and parse the scraped problem data from a JSON file.

    Several encodings are attempted in order because the scraped output
    is not guaranteed to be UTF-8.

    Args:
        path: File to read; defaults to the module-level ``file_path``
            so existing callers keep their behavior.

    Returns:
        The parsed JSON data (a list of problem dicts) on success, or
        ``None`` when the file cannot be decoded with any attempted
        encoding, does not contain valid JSON, or another error occurs.
    """
    if path is None:
        # Backward-compatible default: the module-level configured file.
        path = file_path
    encodings_to_try = ['utf-8', 'iso-8859-1', 'windows-1252']
    for encoding in encodings_to_try:
        try:
            with open(path, 'r', encoding=encoding) as file:
                # Read inside the `with` so decode errors surface here.
                file_content = file.read().strip()
            data = json.loads(file_content)
            print(f"Successfully loaded JSON data from {path} using {encoding} encoding")
            return data
        except UnicodeDecodeError:
            # Wrong encoding guess — fall through to the next candidate.
            print(f"Couldn't decode with {encoding}, trying next encoding...")
        except json.JSONDecodeError as e:
            # The bytes decoded fine but are not valid JSON; retrying
            # with another encoding would not help, so give up.
            print(f"Error decoding JSON data: {e}")
            print("Hint: Make sure the file contains valid JSON. Each object should be separated by commas.")
            return None
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return None

    print("Error: Unable to decode the file with any of the attempted encodings.")
    return None
30+
31+
32+
if __name__ == "__main__":
    # Make list from file 'output.json'
    problem_list = process_json_file()

    # map from tag name -> number of problems carrying that tag
    problem_tags = {}

    # Count topics and no. of problems in the list
    total_problems = 0
    sum_of_types = 0
    # Guard: process_json_file() returns None on failure; iterating None
    # would raise a TypeError and hide the real (already printed) error.
    if problem_list:
        for problem in problem_list:
            for tag in problem["tags"]:
                # dict.get with a default replaces the manual if/else counter.
                problem_tags[tag] = problem_tags.get(tag, 0) + 1
                sum_of_types += 1
            total_problems += 1

    # make the dict to json format with indentation of 4
    json_str = json.dumps(problem_tags, indent=4)
    print(json_str)

    print(f"Total problems {total_problems}")
    print(f"Total types of problems {sum_of_types}")
58+

0 commit comments

Comments
 (0)