Implement RKR-GST algorithm #272

Draft · wants to merge 2 commits into base: master
80 changes: 80 additions & 0 deletions lib/java/PlagiarismDetection/src/pd/RKHasher.java
@@ -0,0 +1,80 @@
/*
This file is part of SSID.

SSID is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

SSID is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with SSID. If not, see <http://www.gnu.org/licenses/>.
*/

package pd;


import java.util.ArrayList;
import java.util.List;
import pd.utils.NGrams.NGramList;

// This class precomputes and serves the hash values of the windows ngList[i..i + windowSize - 1] for all i within
// bounds for a given NGramList, using a rolling (Rabin-Karp style) hash as described in
// https://brilliant.org/wiki/rabin-karp-algorithm/.
public class RKHasher {
	private int windowSize;
	private List<Integer> precomputedHashes;
	// Prime base and modulus for the rolling hash; reducing modulo MOD keeps every value within int range.
	private static final long BASE = 100007L;
	private static final long MOD = 1000000007L;

	/**
	 * Constructor.
	 *
	 * @param windowSize size of each window
	 * @param ngList     n-gram list whose window hashes are precomputed
	 */
public RKHasher(int windowSize, NGramList ngList) {
this.windowSize = windowSize;
precomputeHashesForNGramList(ngList);
}

	/**
	 * Calculates the hash values of the windows ngList[i..i + windowSize - 1] for all i within bounds.
	 */
	public void precomputeHashesForNGramList(NGramList ngList) {
		precomputedHashes = new ArrayList<Integer>();
		if (ngList.size() < windowSize) return;

		// BASE^(windowSize - 1) mod MOD, the weight of the leading n-gram in each window.
		long leadingPower = 1;
		for (int i = 1; i < windowSize; i++) {
			leadingPower = (leadingPower * BASE) % MOD;
		}

		// Computes the hash of the first window ngList[0..windowSize - 1].
		long currentHash = 0;
		for (int i = 0; i < windowSize; i++) {
			long ngHashValue = Math.floorMod(ngList.get(i).nGramHash().longValue(), MOD);
			currentHash = (currentHash * BASE + ngHashValue) % MOD;
		}
		precomputedHashes.add((int) currentHash);

		// Moves the window forward: drops the leading n-gram's contribution, shifts by one power
		// of the base, and appends the new trailing n-gram.
		for (int windowStart = 1, windowEnd = windowSize; windowEnd < ngList.size(); windowStart++, windowEnd++) {
			long toRemove = Math.floorMod(ngList.get(windowStart - 1).nGramHash().longValue(), MOD);
			long toAdd = Math.floorMod(ngList.get(windowEnd).nGramHash().longValue(), MOD);

			currentHash = (currentHash - (toRemove * leadingPower) % MOD + MOD) % MOD;
			currentHash = (currentHash * BASE + toAdd) % MOD;
			precomputedHashes.add((int) currentHash);
		}
	}

// Returns hash value of window starting from ngIndex.
public int getHashStartingFrom(int ngIndex) {
return precomputedHashes.get(ngIndex);
}
}
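For context on the hashing scheme above, here is a minimal standalone sketch (not part of this diff; it assumes plain long values in place of NGram hashes and the same BASE/MOD constants): dropping the leading value's contribution, shifting by one power of the base, and appending the new trailing value yields the same hash as recomputing the window from scratch.

// Standalone illustration of the rolling-hash update assumed in RKHasher (the long values are
// stand-ins for NGram hashes; BASE and MOD mirror the constants used above).
public class RollingHashSketch {
	static final long BASE = 100007L, MOD = 1000000007L;

	// Hash of values[start .. start + windowSize - 1], computed from scratch.
	static long directHash(long[] values, int start, int windowSize) {
		long h = 0;
		for (int i = start; i < start + windowSize; i++) {
			h = (h * BASE + Math.floorMod(values[i], MOD)) % MOD;
		}
		return h;
	}

	public static void main(String[] args) {
		long[] values = {41, 7, 99, 12, 7, 63, 41};
		int windowSize = 3;

		long leadingPower = 1;
		for (int i = 1; i < windowSize; i++) leadingPower = (leadingPower * BASE) % MOD;

		long rolling = directHash(values, 0, windowSize);
		for (int start = 1; start + windowSize <= values.length; start++) {
			long toRemove = Math.floorMod(values[start - 1], MOD);
			long toAdd = Math.floorMod(values[start + windowSize - 1], MOD);
			rolling = (rolling - (toRemove * leadingPower) % MOD + MOD) % MOD;
			rolling = (rolling * BASE + toAdd) % MOD;
			// Each incremental update matches the hash recomputed from scratch.
			System.out.println(rolling == directHash(values, start, windowSize));
		}
	}
}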
127 changes: 96 additions & 31 deletions lib/java/PlagiarismDetection/src/pd/SimComparer.java
@@ -385,7 +385,7 @@ private Result compareSubmissions(Submission s1, Submission s2,
gst(s1NGrams, s2.getNGramIndexingTable(),
skeleton.getNGramIndexingTable(), s1.getCodeTokens(),
s2.getCodeTokens(), skeleton.getCodeTokens(), minMatch,
result.getTokenIndexMappings());
result.getTokenIndexMappings(), s1, s2, skeleton);

getCodeMapping(s1.getCodeTokens(), s2.getCodeTokens(), result);

@@ -441,61 +441,124 @@ private void gst(NGramList s1NGrams,
HashMap<NGram, ArrayList<Integer>> s2NGramIndices,
HashMap<NGram, ArrayList<Integer>> bNGramIndices,
TokenList s1Tokens, TokenList s2Tokens, TokenList bTokens,
int minMatch, ArrayList<Mapping> tokenMappings) {
int minMatch, ArrayList<Mapping> tokenMappings, Submission s1, Submission s2,
Submission skeleton) {

int nGramSize = s1NGrams.size() > 0 ? NGram.getNGramSize() : 0;

ArrayList<Mapping> mappings;

		// Uses int[1] instead of int so that maxMatch can be passed around and updated by reference
int[] maxMatch = new int[1];

do {
maxMatch[0] = minMatch;
mappings = new ArrayList<Mapping>();
mappings = gstPhase1(s1Tokens, s1NGrams, s2NGramIndices, s2Tokens,
maxMatch, mappings);
maxMatch, mappings, s1, s2, skeleton, minMatch);

gstPhase2(mappings, s1Tokens, nGramSize, s1NGrams, s2Tokens,
bTokens, bNGramIndices, minMatch, tokenMappings);
bTokens, bNGramIndices, minMatch, tokenMappings, s1, s2, skeleton);

} while (maxMatch[0] > minMatch);
}

private ArrayList<Mapping> gstPhase1(TokenList s1Tokens,
NGramList s1NGrams,
HashMap<NGram, ArrayList<Integer>> s2NGramIndices,
TokenList s2Tokens, int[] maxMatch, ArrayList<Mapping> mappings) {

// curCSMapped = current countable statement mapped
// curNCSMapped = current non-countable statement mapped
int s1EndIndex, s2EndIndex, curCSMapped, curNCSMapped, curTotalStatementMapped, s1EndOfStmtIndex, s2StartIndex, s2EndOfStmtIndex;
TokenList s2Tokens, int[] maxMatch, ArrayList<Mapping> mappings,
Submission s1, Submission s2, Submission skeleton, int minMatch) {

NGramList s1NGramsStartingStmtsList;
NGramList s2NGramsStartingStmtsList;
s1NGramsStartingStmtsList = s1.getNGramsStartingStmtsList();
s2NGramsStartingStmtsList = s2.getNGramsStartingStmtsList();

int s1EndIndex, s2EndIndex, curCSMapped, curNCSMapped, curTotalStatementMapped,
s1EndOfStmtIndex, s2StartIndex, s2EndOfStmtIndex;
TokenSSID s1Token, s2Token;
HashMap<Integer, Integer> s2Matches;
ArrayList<Integer> s2Indices;
NGram s1NGram;

for (int s1StartIndex : s1Tokens.getStartOfStmtTokenIndices()) {
if (s1NGrams.size() <= s1StartIndex
		RKHasher rkHasher1 = new RKHasher(minMatch, s1NGramsStartingStmtsList);
		RKHasher rkHasher2 = new RKHasher(minMatch, s2NGramsStartingStmtsList);
Map<Integer, ArrayList<Integer>> precomputedHashMap = new HashMap<Integer, ArrayList<Integer>>();


		// Gets the index of the first unmarked NGram in s2NGramsStartingStmtsList.
		// This is done by checking the first token of the NGram under consideration to determine whether or
		// not that NGram is marked. Markings are tracked through the TokenList, and each statement (which the
		// NGram under consideration belongs to) is marked as a whole, so checking the first token is sufficient.
int firstUnmarkedNGramIdxS2NGramsStartingStmtsList = 0;
for (int i = 0; i < s2NGramsStartingStmtsList.size(); i++) {
int tokenIdx = s2.getTokenIndexOfLoc(s2NGramsStartingStmtsList.get(i).codeStartIndex());

if (!s2Tokens.isTokenMarked(tokenIdx)) {
firstUnmarkedNGramIdxS2NGramsStartingStmtsList = i;
break;
}
}

		// Terminates if the first unmarked index is too close to the end of the list, so no further matches can be found
if (firstUnmarkedNGramIdxS2NGramsStartingStmtsList + minMatch > s2NGramsStartingStmtsList.size() - 1) {
return mappings;
}

		for (int s2NGramIndex = firstUnmarkedNGramIdxS2NGramsStartingStmtsList; s2NGramIndex < s2NGramsStartingStmtsList.size() - minMatch; s2NGramIndex++) {
			// Skips this window unless all minMatch statements it covers are still unmarked in s2.
			boolean isMMLRangeAllUnmarked = true;
			for (int i = s2NGramIndex; i < s2NGramIndex + minMatch; i++) {
				int tokenIdx = s2.getTokenIndexOfLoc(s2NGramsStartingStmtsList.get(i).codeStartIndex());
				if (s2Tokens.isTokenMarked(tokenIdx)) {
					isMMLRangeAllUnmarked = false;
					break;
				}
			}
			if (!isMMLRangeAllUnmarked) continue;

			// Indexes the window's rolling hash against the token index where the window starts in s2.
			int hashStartFromIdx2 = rkHasher2.getHashStartingFrom(s2NGramIndex);
			int s2WindowStartTokenIdx = s2.getTokenIndexOfLoc(s2NGramsStartingStmtsList.get(s2NGramIndex).codeStartIndex());

			if (!precomputedHashMap.containsKey(hashStartFromIdx2)) {
				precomputedHashMap.put(hashStartFromIdx2, new ArrayList<Integer>());
			}
			precomputedHashMap.get(hashStartFromIdx2).add(s2WindowStartTokenIdx);
		}

		// Gets the index of the first unmarked NGram in s1NGramsStartingStmtsList.
		// This is done by checking the first token of the NGram under consideration to determine whether or
		// not that NGram is marked. Markings are tracked through the TokenList, and each statement (which the
		// NGram under consideration belongs to) is marked as a whole, so checking the first token is sufficient.
int firstUnmarkedNGramIdxS1NGramsStartingStmtsList = 0;
for (int i = 0; i < s1NGramsStartingStmtsList.size(); i++) {
int tokenIdx = s1.getTokenIndexOfLoc(s1NGramsStartingStmtsList.get(i).codeStartIndex());
if (!s1Tokens.isTokenMarked(tokenIdx)) {
firstUnmarkedNGramIdxS1NGramsStartingStmtsList = i;
break;
}
}

		// Terminates if the first unmarked index is too close to the end of the list, so no further matches can be found
if (firstUnmarkedNGramIdxS1NGramsStartingStmtsList + maxMatch[0] > s1NGramsStartingStmtsList.size() - 1) {
return mappings;
}

for (int s1NGramIndex = firstUnmarkedNGramIdxS1NGramsStartingStmtsList; s1NGramIndex < s1NGramsStartingStmtsList.size(); s1NGramIndex++) {
int s1StartIndex = s1.getTokenIndexOfLoc(s1NGramsStartingStmtsList.get(s1NGramIndex).codeStartIndex());

if (s1NGramsStartingStmtsList.size() <= s1NGramIndex
|| s1NGramIndex + maxMatch[0] > s1NGramsStartingStmtsList.size() - 1
|| s1Tokens.isTokenMarked(s1StartIndex)) {
continue;
}

s1NGram = s1NGrams.get(s1StartIndex);
// logger.debug("The n-gram is: {} ", s1NGram.getTokenList().toString());

if (s2NGramIndices.containsKey(s1NGram)) {
s2Indices = s2NGramIndices.get(s1NGram);
s2Matches = new HashMap<Integer, Integer>();
for (int index : s2Indices) {
if (s2Tokens.isTokenMarked(index)) {
continue;
}
s2Matches.put(index, index);
}
for (Map.Entry<Integer, Integer> s2Match : s2Matches.entrySet()) {
			int hashStartingFromS1NGramIndex = rkHasher1.getHashStartingFrom(s1NGramIndex);

if (precomputedHashMap.containsKey(hashStartingFromS1NGramIndex)) {
				for (int s2Match : precomputedHashMap.get(hashStartingFromS1NGramIndex)) {
s1EndIndex = s1StartIndex;
curCSMapped = 0;
curNCSMapped = 0;
s2EndIndex = s2Match.getValue();
s2EndIndex = s2Match;
s1EndOfStmtIndex = -1;
while (s1Tokens.size() > s1EndIndex
&& s2Tokens.size() > s2EndIndex
@@ -517,9 +580,11 @@ private ArrayList<Mapping> gstPhase1(TokenList s1Tokens,
}

curTotalStatementMapped = curCSMapped + curNCSMapped;

				s1NGram = s1NGramsStartingStmtsList.get(s1NGramIndex);
if (curTotalStatementMapped >= maxMatch[0]) {
s1Token = s1Tokens.get(s1EndOfStmtIndex);
s2StartIndex = s2Match.getKey();
s2StartIndex = s2Match;
s2EndOfStmtIndex = s1EndOfStmtIndex - s1StartIndex
+ s2StartIndex;
s2Token = s2Tokens.get(s2EndOfStmtIndex);
@@ -560,7 +625,7 @@ private void gstPhase2(ArrayList<Mapping> mappings, TokenList s1Tokens,
int nGramSize, NGramList s1NGrams, TokenList s2Tokens,
TokenList bTokens,
HashMap<NGram, ArrayList<Integer>> bNGramIndices, int minMatch,
ArrayList<Mapping> tokenMappings) {
ArrayList<Mapping> tokenMappings, Submission s1, Submission s2, Submission skeleton) {

NGramList s1RegionNGrams;
TokenList s1RegionTokens;
@@ -607,7 +672,7 @@ private void gstPhase2(ArrayList<Mapping> mappings, TokenList s1Tokens,
bMappings = new ArrayList<Mapping>();
if (bTokens != null) {
gst(s1RegionNGrams, bNGramIndices, null, s1RegionTokens,
bTokens, null, minMatch, bMappings);
bTokens, null, minMatch, bMappings, s1, skeleton, null);
mappedCountableStmt = m.getMappedCountableStmtCount();
// logger.debug("Mapping is: {} ", m.toString());

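To summarize the phase-1 change in SimComparer above: every window of minMatch statement-starting n-grams in s2 is indexed by its rolling hash, and each unmarked s1 window then probes that index instead of scanning the per-n-gram indexing table. A condensed, standalone sketch follows (simplified and illustrative only: plain int hashes and token indices stand in for the RKHasher/NGramList/TokenList machinery, and the marking checks are omitted).

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Simplified phase-1 lookup: the window hashes stand in for RKHasher.getHashStartingFrom(...),
// and the values stored per hash are the token indices where the matching s2 windows start.
public class Phase1LookupSketch {
	public static void main(String[] args) {
		int[] s2WindowHashes = {17, 42, 42, 99};    // hash of the s2 window starting at each statement
		int[] s2WindowStartTokens = {0, 5, 11, 18}; // token index where each s2 window starts
		int[] s1WindowHashes = {42, 7};             // hashes of s1 windows used to probe

		// Indexes every s2 window by its hash (mirrors precomputedHashMap in gstPhase1).
		Map<Integer, List<Integer>> hashToS2Starts = new HashMap<>();
		for (int i = 0; i < s2WindowHashes.length; i++) {
			hashToS2Starts.computeIfAbsent(s2WindowHashes[i], k -> new ArrayList<>())
					.add(s2WindowStartTokens[i]);
		}

		// Each s1 window probes the index; candidates would then be verified token by token
		// and extended, as the existing while-loop in gstPhase1 does.
		for (int s1Hash : s1WindowHashes) {
			List<Integer> candidates = hashToS2Starts.getOrDefault(s1Hash, new ArrayList<>());
			System.out.println("s1 window hash " + s1Hash + " -> candidate s2 starts " + candidates);
		}
	}
}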
18 changes: 11 additions & 7 deletions lib/java/PlagiarismDetection/src/pd/utils/NGrams/NGram.java
@@ -141,12 +141,16 @@ public String nGramValue() {
return nGram;
}

public BigInteger nGramHash() throws NoSuchAlgorithmException {
String nGram = nGramValue();

MessageDigest messageDigest = MessageDigest.getInstance("SHA-256");
byte[] digest = messageDigest.digest(nGram.getBytes(StandardCharsets.UTF_8));
BigInteger nbr = new BigInteger(1, digest);
return nbr;
public BigInteger nGramHash() {
try {
String nGram = nGramValue();

MessageDigest messageDigest = MessageDigest.getInstance("SHA-256");
byte[] digest = messageDigest.digest(nGram.getBytes(StandardCharsets.UTF_8));
BigInteger nbr = new BigInteger(1, digest);
return nbr;
	} catch (NoSuchAlgorithmException e) {
		// SHA-256 is required on all Java platforms; rethrowing is safer than returning null,
		// which would only fail later as a NullPointerException in callers such as RKHasher.
		throw new IllegalStateException("SHA-256 MessageDigest not available", e);
	}
}
}
15 changes: 15 additions & 0 deletions lib/java/PlagiarismDetection/src/pd/utils/Submission.java
@@ -29,6 +29,7 @@ public final class Submission {
private int codeLength = 0;
private TokenList codeTokens = new TokenList();
private NGramList nGramList = new NGramList();
private NGramList nGramsStartingStmtsList = new NGramList();
private HashMap<NGram, ArrayList<Integer>> nGramIndexingTable = new HashMap<NGram, ArrayList<Integer>>();
private boolean isSkeletonCode = false;

@@ -89,6 +90,10 @@ public NGramList getNGramList() {
return this.nGramList;
}

public NGramList getNGramsStartingStmtsList() {
return this.nGramsStartingStmtsList;
}

public void setNGramList(NGramList list) {
this.nGramList = list;

@@ -106,11 +111,21 @@ public void setNGramList(NGramList list) {
nGramIndexingTable.put(n, indices);
}
indices.add(i);
nGramsStartingStmtsList.add(n);
}
i++;
}
}

	// Returns the index in codeTokens of the token whose code start index equals loc, or -1 if no such token exists.
	public int getTokenIndexOfLoc(int loc) {
		for (int i = 0; i < codeTokens.size(); i++) {
			if (this.codeTokens.get(i).getCodeStartIndex() == loc) {
				return i;
			}
		}
		return -1;
	}

public HashMap<NGram, ArrayList<Integer>> getNGramIndexingTable() {
return this.nGramIndexingTable;
}
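One note on the Submission change above: getTokenIndexOfLoc rescans codeTokens on every call, and gstPhase1 calls it inside loops over the statement-starting n-grams. A hedged sketch (illustrative only, not part of this PR; the names are made up) of precomputing a loc-to-index map once, so later lookups return the same answer in constant time:

import java.util.HashMap;
import java.util.Map;

// Illustrative only: a one-time map from a token's code start index to its position in the
// token list, giving the same answer as the linear scan in getTokenIndexOfLoc.
public class TokenLocIndexSketch {
	public static void main(String[] args) {
		// Stand-ins for codeTokens.get(i).getCodeStartIndex() values.
		int[] codeStartIndices = {0, 4, 9, 15, 23};

		Map<Integer, Integer> locToTokenIndex = new HashMap<>();
		for (int i = 0; i < codeStartIndices.length; i++) {
			locToTokenIndex.put(codeStartIndices[i], i);
		}

		// Equivalent of getTokenIndexOfLoc(9): prints 2, or -1 if the location is unknown.
		System.out.println(locToTokenIndex.getOrDefault(9, -1));
		System.out.println(locToTokenIndex.getOrDefault(10, -1));
	}
}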
Binary file modified lib/java/jar/PlagiarismDetection.jar
Binary file not shown.