Skip to content

Commit d804d5c

Browse files
iwishiwasaneagle authored and platisd committed
Implement removal of comments and docstrings
1 parent 79f2389 commit d804d5c

File tree

4 files changed

+48
-4
lines changed

4 files changed

+48
-4
lines changed

action.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ inputs:
3131
description: 'The maximum allowed similarity percentage before the action warns'
3232
required: false
3333
default: 100
34+
only_code:
35+
description: "Removes comments and docstrings from the source code before analysis"
36+
required: false
37+
default: false
3438
runs:
3539
using: 'docker'
3640
image: 'Dockerfile'

duplicate_code_detection.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
import sys
99
import argparse
1010
import gensim
11+
import ast
12+
import astor
13+
import re
1114
import tempfile
1215
import json
1316
from enum import Enum
@@ -55,6 +58,37 @@ def conditional_print(text, machine_friendly_output):
5558
if not machine_friendly_output:
5659
print(text)
5760

61+
def remove_comments_and_docstrings(source_code: str) -> str:
    """Strip comments and docstrings from Python source code.

    The source is parsed into an AST, the leading docstring expression of the
    module and of every class, function and async function is dropped, and the
    tree is rendered back to text. Comments disappear as a side effect because
    the AST does not retain them. The returned text is regenerated from the
    tree, so its formatting may differ from the input.

    .. seealso::

        https://gist.github.com/phpdude/1ae6f19de213d66286c8183e9e3b9ec1

    :param source_code: Raw Python source code as a single string
    :type source_code: str
    :return: Stripped source code as a single string
    :rtype: str
    :raises SyntaxError: If ``source_code`` cannot be parsed as Python
    """
    parsed = ast.parse(source_code)
    docstring_owners = (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef, ast.Module)

    for node in ast.walk(parsed):
        if not isinstance(node, docstring_owners):
            continue

        # ast.get_docstring detects a leading string expression without
        # touching ast.Str, which is deprecated since Python 3.8 and removed
        # in 3.14. clean=False skips the (unneeded) re-indentation of the text.
        if ast.get_docstring(node, clean=False) is None:
            continue

        node.body = node.body[1:]

        # A def/class that consisted only of its docstring would be left with
        # an empty body, which is not valid Python; keep it well-formed.
        if not node.body and not isinstance(node, ast.Module):
            node.body.append(ast.Pass())

    # Prefer the standard-library unparser (Python 3.9+); fall back to astor
    # on older interpreters where ast.unparse does not exist.
    unparse = getattr(ast, "unparse", None)
    if unparse is None:
        unparse = astor.to_source
    return unparse(parsed)
91+
5892

5993
def main():
6094
parser_description = CliColors.HEADER + CliColors.BOLD + \
@@ -78,17 +112,18 @@ def main():
78112
help="File extensions to check for similarities.")
79113
parser.add_argument("--ignore-threshold", type=int, default=0,
80114
help="Don't print out similarity below the ignore threshold")
115+
parser.add_argument("--only-code", action="store_true", help="Removes comments and docstrings from the source code before analysis")
81116
args = parser.parse_args()
82117

83118
result = run(args.fail_threshold, args.directories, args.files, args.ignore_directories,
84119
args.ignore_files, args.json, args.project_root_dir, args.file_extensions,
85-
args.ignore_threshold)
120+
args.ignore_threshold, args.only_code)
86121

87122
return result
88123

89124

90125
def run(fail_threshold, directories, files, ignore_directories, ignore_files,
91-
json_output, project_root_dir, file_extensions, ignore_threshold):
126+
json_output, project_root_dir, file_extensions, ignore_threshold, only_code,):
92127
# Determine which files to compare for similarities
93128
source_code_files = list()
94129
files_to_ignore = list()
@@ -139,7 +174,10 @@ def run(fail_threshold, directories, files, ignore_directories, ignore_files,
139174
# read file but also recover from encoding errors in source files
140175
with open(source_code_file, 'r', errors='surrogateescape') as f:
141176
# Store source code with the file path as the key
142-
source_code[source_code_file] = f.read()
177+
content = f.read()
178+
if only_code:
179+
content = remove_comments_and_docstrings(content)
180+
source_code[source_code_file] = content
143181
except Exception as err:
144182
print(f'ERROR: Failed to open file {source_code_file}, reason: {str(err)}')
145183

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
gensim>=3.8
22
nltk>=3.5
3+
astor>=0.8.1

run_action.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ def main():
9999
project_root_dir = os.environ.get('INPUT_PROJECT_ROOT_DIR')
100100
file_extensions = os.environ.get('INPUT_FILE_EXTENSIONS')
101101
ignore_threshold = os.environ.get('INPUT_IGNORE_BELOW')
102+
# Environment values are strings, so "false" must not end up truthy in the
# later bool() conversion; only a literal "true" (any case) enables the option.
only_code = (os.environ.get('INPUT_ONLY_CODE') or '').strip().lower() == 'true'
102103

103104
directories_list = split_and_trim(directories)
104105
directories_list = to_absolute_path(directories_list)
@@ -115,7 +116,7 @@ def main():
115116
detection_result, code_similarity = duplicate_code_detection.run(int(fail_threshold), directories_list, files_list,
116117
ignore_directories_list, ignore_files_list,
117118
json_output, project_root_dir, file_extensions_list,
118-
int(ignore_threshold))
119+
int(ignore_threshold), bool(only_code))
119120

120121
if detection_result == duplicate_code_detection.ReturnCode.BAD_INPUT:
121122
print("Action aborted due to bad user input")

0 commit comments

Comments
 (0)