#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Check for cheating. Output file in format:
distance file1 file2
usage:
pygrade cheat --test <file> [--students <file>] [--output <file>] [--workdir <file>]
Options
-h, --help
-o, --output <file> Output file [default: cheats.tsv]
-s, --students <file> Students TSV file [default: students.tsv]
-t, --test <file> File containing python tests for grading
-w, --workdir <file> Temporary directory for storing assignments [default: students]
"""
from docopt import docopt
from itertools import combinations
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from . import get_local_repo, read_assignment_metadata, read_students
def parse_assignments(students, test_path, path):
    """Build one TF-IDF vector per student from their assignment sources.

    The list of files to read comes from the ``'files_to_test'`` entry of
    the metadata returned by ``read_assignment_metadata(test_path)``. For
    every student, each of those files is read from the student's local
    repository, passed through ``strip_comments`` and concatenated into a
    single document. A missing file is reported on stdout and skipped, so a
    student with no readable files contributes an empty document.

    Args:
        students: Iterable of student records; each one is handed to
            ``get_local_repo`` unchanged.
        test_path: Path of the test file passed to
            ``read_assignment_metadata``.
        path: Working directory passed to ``get_local_repo``.

    Returns:
        A ``(X, filenames)`` pair: ``X`` is the result of
        ``TfidfVectorizer.fit_transform`` over the per-student documents,
        and ``filenames`` holds the repository of each student, in the same
        order as the rows of ``X``.
    """
    # NOTE(review): strip_comments is not defined in the visible part of
    # this file -- confirm it is available at module level.
    metadata = read_assignment_metadata(test_path)
    assignment_subpaths = metadata['files_to_test']
    strings = []
    filenames = []
    for s in students:
        this_string = ''
        # presumably a filesystem path, since it is fed to os.path.join
        repo = get_local_repo(s, path)
        for assignment_subpath in assignment_subpaths:
            fname = os.path.join(repo, assignment_subpath)
            try:
                # Context manager so the handle is closed promptly instead
                # of being left for the garbage collector.
                with open(fname) as source_file:
                    # readlines() keeps each line terminator, so this join
                    # doubles the line breaks; preserved so strip_comments
                    # receives exactly the same text as before.
                    src = '\n'.join(source_file.readlines())
                this_string += strip_comments(src)
            except FileNotFoundError as e:
                # Best effort: report the missing file and keep going.
                print('FileNotFound' + str(e))
        # One document and one label per student, not per file.
        strings.append(this_string)
        filenames.append(repo)
    print('read %d files' % len(strings))
    # The token pattern matches runs of one or more word characters, so
    # single-character identifiers are kept as tokens.
    vec = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
    X = vec.fit_transform(strings)
    return X, filenames
def compare_assignments(students, test_path, path):
    """Rank every pair of submissions by cosine distance.

    Vectorizes the submissions with ``parse_assignments`` and computes the
    pairwise cosine distance matrix over the resulting vectors.

    Args:
        students: Student records, forwarded to ``parse_assignments``.
        test_path: Path of the test file, forwarded to
            ``parse_assignments``.
        path: Working directory, forwarded to ``parse_assignments``.

    Returns:
        A list of ``(distance, label_i, label_j)`` tuples, one per
        unordered pair of submissions, sorted in ascending order (smallest
        distance first).
    """
    matrix, labels = parse_assignments(students, test_path, path)
    cosine = pairwise_distances(matrix, metric='cosine')
    ranked = []
    # combinations() yields each index pair once with first < second, so no
    # submission is compared with itself and no pair is repeated.
    for first, second in combinations(range(len(labels)), 2):
        ranked.append((cosine[first, second], labels[first], labels[second]))
    ranked.sort()
    return ranked
def write_output(results, out_path):
    """Write comparison results to a tab-separated file.

    Each result becomes one line of the form
    ``<distance>\\t<first>\\t<second>``, with the distance formatted to four
    decimal places. An existing file at ``out_path`` is overwritten. A
    confirmation message is printed once the file has been written.

    Args:
        results: Iterable of indexable records whose first three items are
            the distance and the two labels being compared.
        out_path: Path of the file to write.
    """
    # The context manager closes (and flushes) the file even if formatting
    # or writing one of the rows raises.
    with open(out_path, 'w') as outf:
        for r in results:
            outf.write('%.4f\t%s\t%s\n' % (r[0], r[1], r[2]))
    print('saved results in %s' % out_path)
def main():
    """Command-line entry point for the cheat check.

    Parses the command line against the module docstring with docopt,
    loads the students file, compares the assignments found under the
    working directory and writes the ranked pairs to the output file.
    Progress messages are printed along the way.
    """
    options = docopt(__doc__)
    workdir = options['--workdir']
    print('working directory=%s' % workdir)
    roster = read_students(options['--students'])
    print('read %d students' % len(roster))
    write_output(
        compare_assignments(roster, options['--test'], workdir),
        options['--output'],
    )
# Run the command-line entry point only when this module is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()