diff --git a/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/StrictDuplicateCodeCheck.java b/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/StrictDuplicateCodeCheck.java new file mode 100644 index 000000000..72928954c --- /dev/null +++ b/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/StrictDuplicateCodeCheck.java @@ -0,0 +1,422 @@ +//////////////////////////////////////////////////////////////////////////////// +// checkstyle: Checks Java source code for adherence to a set of rules. +// Copyright (C) 2001-2003 Oliver Burn +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +//////////////////////////////////////////////////////////////////////////////// +package com.puppycrawl.tools.checkstyle.checks.duplicates; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; + +import com.puppycrawl.tools.checkstyle.api.AbstractFileSetCheck; +import com.puppycrawl.tools.checkstyle.api.Utils; +import com.puppycrawl.tools.checkstyle.api.MessageDispatcher; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * Checks for duplicate code. + * + *
+ * There are many approaches for detecting duplicate code. Some involve + * parsing a file of a programming language and analyzing the source trees + * of all files. This is a very powerful approach for a specific programming + * language (such as Java), as it can potentially even detect duplicate code + * where linebreaks have been changed, variables have been renamed, etc. + *
+ *
+ * This copy and paste detection implementation works differently.
+ * It cannot detect copy and paste code where the author deliberately
+ * tries to hide his copy+paste action. Instead it focuses on the standard
+ * corporate problem of reuse by copy and paste. Usually this leaves linebreaks
+ * and variable names intact. Since we do not need to analyse a parse tree
+ * our tool is not tied to a particular programming language.
+ *
+ *+ * We aim to achieve a performance on par with the + * PMD Copy+Paste detection tool, + * but with significantly lower memory requirements. + *
+ *+ * Simian + * is a very good commercial duplicate code detection tool. It comes with + * a Checkstyle module, so we encourage all users to evaluate Simian + * as an alternative to this check. + *
 *
 * @author Lars Kühne
 */
public final class StrictDuplicateCodeCheck extends AbstractFileSetCheck
{
    /**
     * Converts each of the original source lines
     * to a checksum that is checked against to find duplicates.
     */
    private interface ChecksumGenerator
    {
        /**
         * Convert each of the original source lines
         * to a checksum that is checked against to find duplicates.
         * Typically this involves stripping whitespace.
         * @param aOriginalLines the original lines as they appear in the source
         * @return one checksum per input line; a value of IGNORE marks a
         *     line that should be excluded from the duplicate analysis
         */
        long[] convertLines(String[] aOriginalLines);
    }

    /**
     * Calculates checksums for java source files.
     * Removes leading and trailing whitespace and
     * ignores imports.
     */
    private class JavaChecksumGenerator implements ChecksumGenerator
    {
        /** @see ChecksumGenerator#convertLines */
        public long[] convertLines(String[] aOriginalLines)
        {
            // TODO: return IGNORE for lines in the header comment?
            // That would require some simple parsing...

            // we could also parse the java code using the TreeWalker
            // and then ignore everything before the CLASS_DEF...

            long[] checkSums = new long[aOriginalLines.length];
            for (int i = 0; i < aOriginalLines.length; i++) {
                String line = aOriginalLines[i].trim();
                checkSums[i] = calcChecksum(line);
            }
            return checkSums;
        }

        /**
         * Computes a checksum for a line. To avoid false alarms it is
         * important that different lines result in different checksums.
         * @param aLine the line to compute a checksum for
         * @return the checksum of the line, or IGNORE for lines
         *     (currently: import statements) that should be skipped
         */
        private long calcChecksum(String aLine)
        {
            if (aLine.startsWith("import ")) {
                return IGNORE;
            }
            else {
                return reallyCalcChecksum(aLine);
            }
        }

        /**
         * Does the dirty work of computing a checksum for a line.
         * @param aLine the line to compute a checksum for
         * @return checksum of the line content
         */
        private long reallyCalcChecksum(String aLine)
        {
            // important that it's larger than the length of most lines
            // see http://www.utm.edu/research/primes/lists/small/1000.txt
            final int bigPrime = 317;

            // TODO: Not sure that this algorithm makes it
            // sufficiently improbable to get false alarms
            long result = 0;
            for (int i = 0; i < aLine.length(); i++) {
                long c = aLine.charAt(i);
                long idx = i;
                result += bigPrime * idx + c;
            }
            return result;
        }
    }

    /** a jakarta commons log */
    private static final Log LOG =
        LogFactory.getLog(StrictDuplicateCodeCheck.class);

    /** the checksum value to use for lines that should be ignored */
    private static final long IGNORE = Long.MIN_VALUE;

    /** default value for mMin */
    private static final int DEFAULT_MIN_DUPLICATE_LINES = 12;

    /** number of lines that have to be identical for reporting duplicates */
    private int mMin = DEFAULT_MIN_DUPLICATE_LINES;

    /** the checksums of all files that are currently checked */
    private long[][] mLineChecksums;

    /** helper to speed up searching algorithm */
    private long[][] mSortedRelevantChecksums;

    /** files that are currently checked */
    private File[] mFiles;

    // fields required only for statistics

    /** total number of duplicates found */
    private int mDuplicates;

    /** lines of code that have been checked */
    private int mLoc;

    /** number of cache misses */
    private long mCacheMisses = 0;

    /** number of cache hits */
    private long mCacheHits = 0;

    /** Creates a new instance of this class.
*/ + public StrictDuplicateCodeCheck() + { + setFileExtensions(new String[]{"java"}); + } + + /** + * @see com.puppycrawl.tools.checkstyle.api.AbstractFileSetCheck#process + */ + public synchronized void process(File[] aFiles) + { + long start = System.currentTimeMillis(); + mLoc = 0; + mDuplicates = 0; + mFiles = aFiles; + mLineChecksums = new long[mFiles.length][]; + mSortedRelevantChecksums = new long[mFiles.length][]; + + if (LOG.isDebugEnabled()) { + LOG.debug("Reading input files"); + } + + for (int i = 0; i < aFiles.length; i++) { + try { + File file = mFiles[i]; + String[] lines = Utils.getLines(file.getPath()); + ChecksumGenerator transformer = findChecksumGenerator(file); + mLineChecksums[i] = transformer.convertLines(lines); + } + catch (IOException ex) { + ex.printStackTrace(); // TODO + } + } + fillSortedRelevantChecksums(); + + long endReading = System.currentTimeMillis(); + findDuplicates(); + long endSearching = System.currentTimeMillis(); + + dumpStats(start, endReading, endSearching); + + mLineChecksums = null; + mSortedRelevantChecksums = null; + } + + /** + * Finds the Checksum generator for a given file. + * + * @param aFile the file to check for duplicates + * @return a generator to use for aFile + */ + private ChecksumGenerator findChecksumGenerator(File aFile) + { + if (aFile.getName().endsWith(".java")) { + return new JavaChecksumGenerator(); + } + else { + throw new IllegalArgumentException( + "Non-Java files are currently not supported " + + "(for no particular reason)"); + } + } + + /** + * Dump out statistics data on stderr. 
     * @param aStart start time
     * @param aEndReading end time of reading physical files
     * @param aEndSearching end time of duplicate analysis
     */
    private void dumpStats(long aStart, long aEndReading, long aEndSearching)
    {
        if (LOG.isDebugEnabled()) {
            final long cacheLookups = mCacheHits + mCacheMisses;
            final long initTime = aEndReading - aStart;
            final long workTime = aEndSearching - aEndReading;
            LOG.debug("cache hits = " + mCacheHits + "/" + cacheLookups);
            LOG.debug("files = " + mFiles.length);
            LOG.debug("loc = " + mLoc);
            LOG.debug("duplicates = " + mDuplicates);
            LOG.debug("Runtime = " + initTime + " + " + workTime);
        }
    }

    /**
     * Filters and sorts the relevant lines and stores the result
     * in mSortedRelevantChecksums during the setup phase.
     * That data is later used in a binary search to find out
     * if it is worth investigating a file for duplicates of a block.
     * If one of the lines in the block does not occur in the other file
     * at all, we can skip that file quickly.
     */
    private void fillSortedRelevantChecksums()
    {
        for (int i = 0; i < mLineChecksums.length; i++) {
            int count = 0;
            long[] checksums = mLineChecksums[i];
            long[] relevant = new long[checksums.length];
            for (int j = 0; j < checksums.length; j++) {
                long checksum = checksums[j];
                if (checksum != IGNORE) {
                    relevant[count++] = checksum;
                }
            }
            Arrays.sort(relevant, 0, count);
            long[] result = new long[count];
            System.arraycopy(relevant, 0, result, 0, count);
            mSortedRelevantChecksums[i] = result;
        }
    }

    /**
     * Finds duplicate lines in mFiles,
     * using a text-search algorithm to find recurring
     * patterns in the line checksums.
     */
    private void findDuplicates()
    {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Analysis phase");
        }

        // It's been a while since my CS degree, but I think this is
        // somewhere near O(mMax * LOC^2).
        // It may be possible to do this *much* smarter,
        // but I don't have the Knuth bible at hand right now :-)

        // OK, prepare for some nested loops... :-(

        for (int i = 0; i < mFiles.length; i++) {

            final String path = mFiles[i].getPath();

            getMessageCollector().reset();
            MessageDispatcher dispatcher = getMessageDispatcher();
            dispatcher.fireFileStarted(path);

            mLoc += mLineChecksums[i].length;
            // compare each file only against files with a smaller index,
            // so each pair of files is analysed exactly once
            for (int j = 0; j < i; j++) {
                findDuplicatesInFiles(i, j);
            }

            fireErrors(path);
            dispatcher.fireFileFinished(path);
        }
    }

    /**
     * Compare two files and search for duplicates.
     * @param aI mLineChecksums index of the first file to compare
     * @param aJ mLineChecksums index of the second file to compare
     */
    private void findDuplicatesInFiles(int aI, int aJ)
    {
        final int iFileLength = mLineChecksums[aI].length;

        // build up some supporting data structures
        final boolean[] iLineOccurInJ = new boolean[iFileLength];
        for (int iLine = 0; iLine < iFileLength; iLine++) {
            iLineOccurInJ[iLine] = (Arrays.binarySearch(
                mSortedRelevantChecksums[aJ], mLineChecksums[aI][iLine]) >= 0);
        }

        // go through all the lines in iFile and check if the following
        // mMin lines occur in jFile
        for (int iLine = 0; iLine < iFileLength - mMin; iLine++) {

            // fast exit if one of the lines does not occur in jFile at all
            boolean fastExit = false;
            final int kLimit = iFileLength - iLine;
            for (int k = 0; k < Math.min(mMin, kLimit); k++) {
                if (!iLineOccurInJ[iLine + k]) {
                    fastExit = true;
                    break;
                }
            }

            if (!fastExit) {
                // all lines do occur -> brute force searching
                mCacheMisses += 1;
                iLine = findDuplicateFromLine(aI, aJ, iLine);
            }
            else {
                mCacheHits += 1;
            }
        }
    }

    /**
     * Find and report a duplicate of the code starting from line aILine
     * in file aI in the file aJ
     * @param aI index of file that contains the candidate code
     * @param aJ index of file that is searched for a dup of the candidate
     * @param aILine
 starting line of the candidate in aI
     * @return the next line in file aI where
     *     continuing the search will make sense
     */
    private int findDuplicateFromLine(int aI, int aJ, int aILine)
    {
        // Using something more advanced like Boyer-Moore might be a
        // good idea...

        final int iFileLength = mLineChecksums[aI].length;
        final int jFileLength = mLineChecksums[aJ].length;

        for (int jLine = 0; jLine < jFileLength - mMin; jLine++) {
            int equivalent = 0;
            while (aILine + equivalent < iFileLength
                && jLine + equivalent < jFileLength
                && mLineChecksums[aI][aILine + equivalent] != IGNORE
                && mLineChecksums[aI][aILine + equivalent]
                == mLineChecksums[aJ][jLine + equivalent])
            {
                equivalent += 1;
            }
            // self-comparison of a line range with itself is not a duplicate
            if ((aI != aJ || aILine != jLine) && equivalent >= mMin) {
                reportDuplicate(
                    equivalent, aILine, mFiles[aJ], jLine);
                aILine += equivalent; // skip to end of equivalent section
            }
        }
        return aILine;
    }

    /**
     * Dumps out a duplicate report.
     * @param aEquivalent number of equivalent lines
     * @param aILine location of duplicate code
     *     within file that is currently checked
     * @param aJFile the other file that contains the duplicate
     * @param aJLine location of duplicate code within aJFile
     */
    private void reportDuplicate(
        int aEquivalent, int aILine, File aJFile, int aJLine)
    {
        // NOTE(review): aILine/aJLine are 0-based array indices; verify
        // whether log() and the message expect 1-based line numbers.
        final Integer dupLines = new Integer(aEquivalent);
        final Integer startLine = new Integer(aJLine);
        log(aILine, "duplicates.lines",
            new Object[]{dupLines, aJFile, startLine});
        mDuplicates += 1;
    }

}
diff --git a/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/messages.properties b/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/messages.properties
new file mode 100644
index 000000000..1fbb4e210
--- /dev/null
+++ b/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/messages.properties
@@ -0,0 +1 @@
+duplicates.lines=Found duplicate of {0} lines in {1}, starting from line {2}
\ No newline at end of file
diff 
--git a/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/package.html b/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/package.html new file mode 100644 index 000000000..c4b0deca7 --- /dev/null +++ b/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/package.html @@ -0,0 +1,38 @@ + + +Checks that search for duplicate code. ++There are many trade-offs when writing a duplicate code detection tool. +Some of the conflicting goals are: +
+StrictDuplicateCodeCheck is fast enough to facilitate checking very large code
+bases in acceptable time (minutes). It consumes very little memory; false
+alarms are not impossible but they are a rare case. While it supports multiple
+languages it does not support fuzzy matches (that's why it's called Strict).
+
++Note that there are brilliant commercial implementations of duplicate code +detection tools. One that is particularly noteworthy is +Simian +from RedHill Consulting, Inc. +
+Simian has managed to find a very good balance of the above tradeoffs.
+It is superior to the checks in this package in many respects.
+Simian is reasonably priced (free for noncommercial projects)
+and includes a Checkstyle plugin.
+
+We encourage all users of Checkstyle to evaluate Simian as an
+alternative to the Checks we offer in our distribution.
+
+ + \ No newline at end of file