Skip to content

Commit 5151a50

Browse files
added insights
1 parent 812a787 commit 5151a50

File tree

3 files changed

+135
-0
lines changed

3 files changed

+135
-0
lines changed

README.md

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,59 @@
11
## Codeforces Problemset Scraper
22
This is a Scrapy-based web scraper specifically designed for extracting detailed information about Codeforces problems. The Codeforces API lacks certain details such as URLs, number of people who have solved the problems, and problem ratings. This scraper solves that problem by extracting data for better problem analysis.
33

4+
5+
6+
### Below are some insights from the data
7+
The following topics (tags) are present in the problemset, along with the number of problems for each.
8+
```
9+
{
10+
"2-sat": 34,
11+
"bitmasks": 559,
12+
"data structures": 1693,
13+
"dp": 2074,
14+
"graphs": 1045,
15+
"matrices": 119,
16+
"two pointers": 528,
17+
"strings": 714,
18+
"brute force": 1638,
19+
"constructive algorithms": 1706,
20+
"greedy": 2782,
21+
"implementation": 2681,
22+
"math": 2817,
23+
"number theory": 722,
24+
"binary search": 1030,
25+
"sortings": 1054,
26+
"combinatorics": 659,
27+
"games": 220,
28+
"hashing": 206,
29+
"interactive": 222,
30+
"dfs and similar": 914,
31+
"trees": 803,
32+
"dsu": 346,
33+
"divide and conquer": 283,
34+
"fft": 92,
35+
"geometry": 387,
36+
"string suffix structures": 90,
37+
"probabilities": 231,
38+
"meet-in-the-middle": 49,
39+
"ternary search": 53,
40+
"shortest paths": 262,
41+
"flows": 143,
42+
"*special problem": 437,
43+
"graph matchings": 89,
44+
"schedules": 10,
45+
"expression parsing": 36,
46+
"chinese remainder theorem": 16
47+
}
48+
49+
Total problems:- 9678
50+
Total types of problems:- 26744
51+
```
52+
Use ```findproblems.py``` to find problems matching your rating and topics.
53+
The ```main.py``` file is for analysis of the problemset.
54+
55+
56+
457
### Implemented Features
558
- Random headers for requests
659
- Storage in MySQL database
@@ -21,3 +74,7 @@ This is a Scrapy-based web scraper specifically designed for extracting detailed
2174
2275
> [!WARNING]
2376
> - Be cautious when using this scraper as Codeforces may block your IP address. Consider using a rotating proxy for each request by adding the line `meta={"proxy":"protocol://yourusername:yourpassword@domainname:port/"}` in the `spiders/cf.py` file, specifically on line 27.
77+
78+
79+
> [!TIP]
80+
>

findproblems.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import main

# Load the scraped problem list once (None when loading fails).
problem_set = main.process_json_file()

# Modify these lists to select the problems you are looking for.
find_on_tags = ["bitmasks"]
find_on_rating = ["800"]

problems_found = 0
problems = []
# Guard: process_json_file() returns None on a read/parse failure;
# iterating None would raise a TypeError.
if problem_set:
    for problem in problem_set:
        # Match a problem at most once, even if several of the requested
        # tags apply to it (the old tag-by-tag loop counted duplicates).
        if any(tag in problem["tags"] for tag in find_on_tags):
            # Ratings are stored as strings in the scraped data.
            if problem["problem_rating"] in find_on_rating:
                problems.append(problem["url"])
                print(problem["url"])
                problems_found += 1

print(f"Total problems found {problems_found}")

main.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import json
2+
# /////////////////////////////////
3+
# This is name of file that will be processed
4+
file_path = "output.json"
5+
# /////////////////////////////////
6+
7+
# make list out of output.json file
8+
# make list out of output.json file
def process_json_file(path=None):
    """Load and parse the scraped problem data from a JSON file.

    Several encodings are attempted in order because the scraped output
    is not guaranteed to be UTF-8.

    Args:
        path: File to read; defaults to the module-level ``file_path``
            so existing callers keep their behavior.

    Returns:
        The parsed JSON data (a list of problem dicts) on success, or
        ``None`` when the file cannot be decoded with any attempted
        encoding, does not contain valid JSON, or another error occurs.
    """
    if path is None:
        # Backward-compatible default: the module-level configured file.
        path = file_path
    encodings_to_try = ['utf-8', 'iso-8859-1', 'windows-1252']
    for encoding in encodings_to_try:
        try:
            with open(path, 'r', encoding=encoding) as file:
                # Read inside the `with` so decode errors surface here.
                file_content = file.read().strip()
            data = json.loads(file_content)
            print(f"Successfully loaded JSON data from {path} using {encoding} encoding")
            return data
        except UnicodeDecodeError:
            # Wrong encoding guess — fall through to the next candidate.
            print(f"Couldn't decode with {encoding}, trying next encoding...")
        except json.JSONDecodeError as e:
            # The bytes decoded fine but are not valid JSON; retrying
            # with another encoding would not help, so give up.
            print(f"Error decoding JSON data: {e}")
            print("Hint: Make sure the file contains valid JSON. Each object should be separated by commas.")
            return None
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return None

    print("Error: Unable to decode the file with any of the attempted encodings.")
    return None
30+
31+
32+
if __name__ == "__main__":
    # Make list from file 'output.json'
    problem_list = process_json_file()

    # map from tag name -> number of problems carrying that tag
    problem_tags = {}

    # Count topics and no. of problems in the list
    total_problems = 0
    sum_of_types = 0
    # Guard: process_json_file() returns None on failure; iterating None
    # would raise a TypeError and hide the real (already printed) error.
    if problem_list:
        for problem in problem_list:
            for tag in problem["tags"]:
                # dict.get with a default replaces the manual if/else counter.
                problem_tags[tag] = problem_tags.get(tag, 0) + 1
                sum_of_types += 1
            total_problems += 1

    # make the dict to json format with indentation of 4
    json_str = json.dumps(problem_tags, indent=4)
    print(json_str)

    print(f"Total problems {total_problems}")
    print(f"Total types of problems {sum_of_types}")
58+

0 commit comments

Comments
 (0)