From 201e6c46cd9ff5b43e95da4c2d3550cfe057433a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lars=20K=C3=BChne?=
Date: Sat, 12 Jul 2003 14:25:44 +0000
Subject: [PATCH] first cut at duplicate code detection tool. TBD: -
performance optimizations - remove basedir from secondary file name
---
.../duplicates/StrictDuplicateCodeCheck.java | 422 ++++++++++++++++++
.../checks/duplicates/messages.properties | 1 +
.../checkstyle/checks/duplicates/package.html | 38 ++
3 files changed, 461 insertions(+)
create mode 100644 src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/StrictDuplicateCodeCheck.java
create mode 100644 src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/messages.properties
create mode 100644 src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/package.html
diff --git a/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/StrictDuplicateCodeCheck.java b/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/StrictDuplicateCodeCheck.java
new file mode 100644
index 000000000..72928954c
--- /dev/null
+++ b/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/StrictDuplicateCodeCheck.java
@@ -0,0 +1,422 @@
+////////////////////////////////////////////////////////////////////////////////
+// checkstyle: Checks Java source code for adherence to a set of rules.
+// Copyright (C) 2001-2003 Oliver Burn
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+////////////////////////////////////////////////////////////////////////////////
+package com.puppycrawl.tools.checkstyle.checks.duplicates;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+
+import com.puppycrawl.tools.checkstyle.api.AbstractFileSetCheck;
+import com.puppycrawl.tools.checkstyle.api.Utils;
+import com.puppycrawl.tools.checkstyle.api.MessageDispatcher;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * Checks for duplicate code.
+ *
+ *
+ * There are many approaches for detecting duplicate code. Some involve
+ * parsing a file of a programming language and analyzing the source trees
+ * of all files. This is a very powerful approach for a specific programming
+ * language (such as Java), as it can potentially even detect duplicate code
+ * where linebreaks have been changed, variables have been renamed, etc.
+ *
+ *
+ * This copy and paste detection implementation works differently.
+ * It cannot detect copy and paste code where the author deliberately
+ * tries to hide his copy+paste action. Instead it focuses on the standard
+ * corporate problem of reuse by copy and paste. Usually this leaves linebreaks
+ * and variable names intact. Since we do not need to analyse a parse tree
+ * our tool is not tied to a particular programming language.
+ *
+ *
+ * We aim to achieve a performance on par with the
+ * PMD Copy+Paste detection tool,
+ * but with significantly lower memory requirements.
+ *
+ *
+ * Simian
+ * is a very good commercial duplicate code detection tool. It comes with
+ * a Checkstyle module, so we encourage all users to evaluate Simian
+ * as an alternative to this check.
+ *
+ *
+ * @author Lars Kühne
+ */
+public final class StrictDuplicateCodeCheck extends AbstractFileSetCheck
+{
+    /**
+     * Converts each of the original source lines
+     * to a checksum that is checked against to find duplicates.
+     */
+    private interface ChecksumGenerator
+    {
+        /**
+         * Convert each of the original source lines
+         * to a checksum that is checked against to find duplicates.
+         * Typically this involves stripping whitespace.
+         * @param aOriginalLines the original lines as they appear in the source
+         * @return one checksum per line; the value IGNORE marks lines
+         * that should not take part in duplicate detection
+         */
+        long[] convertLines(String[] aOriginalLines);
+    }
+
+    /**
+     * Calculates checksums for java source files.
+     * Removes leading and trailing whitespace and ignores imports.
+     * Static because it does not use any state of the enclosing check.
+     */
+    private static class JavaChecksumGenerator implements ChecksumGenerator
+    {
+        /** @see ChecksumGenerator#convertLines */
+        public long[] convertLines(String[] aOriginalLines)
+        {
+            // TODO: return IGNORE for lines in the header comment?
+            // That would require some simple parsing...
+
+            // we could also parse the java code using the TreeWalker
+            // and then ignore everything before the CLASS_DEF...
+
+            final long[] checkSums = new long[aOriginalLines.length];
+            for (int i = 0; i < aOriginalLines.length; i++) {
+                final String line = aOriginalLines[i].trim();
+                checkSums[i] = calcChecksum(line);
+            }
+            return checkSums;
+        }
+
+        /**
+         * Computes a checksum for a line. To avoid false alarms it is
+         * important that different lines result in different checksums.
+         * @param aLine the line (already trimmed)
+         * @return checksum of the line, or IGNORE for import statements
+         */
+        private long calcChecksum(String aLine)
+        {
+            // imports are duplicated between files all the time,
+            // reporting them would only create noise
+            if (aLine.startsWith("import ")) {
+                return IGNORE;
+            }
+            return reallyCalcChecksum(aLine);
+        }
+
+        /**
+         * Does the dirty work of computing a checksum for a line.
+         * @param aLine the line
+         * @return checksum
+         */
+        private long reallyCalcChecksum(String aLine)
+        {
+            // important that it's larger than the length of most lines
+            // see http://www.utm.edu/research/primes/lists/small/1000.txt
+            final int bigPrime = 317;
+
+            // TODO: Not sure that this algorithm makes it
+            // sufficiently improbable to get false alarms
+            long result = 0;
+            for (int i = 0; i < aLine.length(); i++) {
+                final long c = aLine.charAt(i);
+                result += bigPrime * (long) i + c;
+            }
+            return result;
+        }
+    }
+
+    /** a jakarta commons log */
+    private static final Log LOG =
+        LogFactory.getLog(StrictDuplicateCodeCheck.class);
+
+    /** the checksum value to use for lines that should be ignored */
+    private static final long IGNORE = Long.MIN_VALUE;
+
+    /** default value for mMin */
+    private static final int DEFAULT_MIN_DUPLICATE_LINES = 12;
+
+    /** number of lines that have to be identical for reporting duplicates */
+    private int mMin = DEFAULT_MIN_DUPLICATE_LINES;
+
+    /** the checksums of all files that are currently checked */
+    private long[][] mLineChecksums;
+
+    /** helper to speed up searching algorithm */
+    private long[][] mSortedRelevantChecksums;
+
+    /** files that are currently checked */
+    private File[] mFiles;
+
+    // fields required only for statistics
+
+    /** total number of duplicates found */
+    private int mDuplicates;
+
+    /** lines of code that have been checked */
+    private int mLoc;
+
+    /** number of cache misses */
+    private long mCacheMisses = 0;
+
+    /** number of cache hits */
+    private long mCacheHits = 0;
+
+    /** Creates a new instance of this class. */
+    public StrictDuplicateCodeCheck()
+    {
+        setFileExtensions(new String[]{"java"});
+    }
+
+    /**
+     * Sets the minimum number of lines that must be equivalent
+     * before a duplicate is reported. Without this setter the
+     * property could not be configured at all.
+     * @param aMin the number of lines
+     */
+    public void setMin(int aMin)
+    {
+        mMin = aMin;
+    }
+
+    /**
+     * @see com.puppycrawl.tools.checkstyle.api.AbstractFileSetCheck#process
+     */
+    public synchronized void process(File[] aFiles)
+    {
+        final long start = System.currentTimeMillis();
+        mLoc = 0;
+        mDuplicates = 0;
+        mFiles = aFiles;
+        mLineChecksums = new long[mFiles.length][];
+        mSortedRelevantChecksums = new long[mFiles.length][];
+
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("Reading input files");
+        }
+
+        for (int i = 0; i < aFiles.length; i++) {
+            final File file = mFiles[i];
+            try {
+                final String[] lines = Utils.getLines(file.getPath());
+                final ChecksumGenerator transformer =
+                    findChecksumGenerator(file);
+                mLineChecksums[i] = transformer.convertLines(lines);
+            }
+            catch (IOException ex) {
+                LOG.error("Cannot read " + file.getPath(), ex);
+                // use an empty entry so the analysis phase below
+                // does not trip over a null array for this file
+                mLineChecksums[i] = new long[0];
+            }
+        }
+        fillSortedRelevantChecksums();
+
+        final long endReading = System.currentTimeMillis();
+        findDuplicates();
+        final long endSearching = System.currentTimeMillis();
+
+        dumpStats(start, endReading, endSearching);
+
+        // release the memory for the checksums as soon as possible
+        mLineChecksums = null;
+        mSortedRelevantChecksums = null;
+    }
+
+    /**
+     * Finds the Checksum generator for a given file.
+     *
+     * @param aFile the file to check for duplicates
+     * @return a generator to use for aFile
+     */
+    private ChecksumGenerator findChecksumGenerator(File aFile)
+    {
+        if (aFile.getName().endsWith(".java")) {
+            return new JavaChecksumGenerator();
+        }
+        throw new IllegalArgumentException(
+            "Non-Java files are currently not supported "
+            + "(for no particular reason)");
+    }
+
+    /**
+     * Dump out statistics data on the debug log.
+     * @param aStart start time
+     * @param aEndReading end time of reading physical files
+     * @param aEndSearching end time of duplicate analysis
+     */
+    private void dumpStats(long aStart, long aEndReading, long aEndSearching)
+    {
+        if (LOG.isDebugEnabled()) {
+            final long cacheLookups = mCacheHits + mCacheMisses;
+            final long initTime = aEndReading - aStart;
+            final long workTime = aEndSearching - aEndReading;
+            LOG.debug("cache hits = " + mCacheHits + "/" + cacheLookups);
+            LOG.debug("files = " + mFiles.length);
+            LOG.debug("loc = " + mLoc);
+            LOG.debug("duplicates = " + mDuplicates);
+            LOG.debug("Runtime = " + initTime + " + " + workTime);
+        }
+    }
+
+    /**
+     * filters and sorts the relevant lines and stores the result
+     * in sortedRelevantChecksums during the setup phase.
+     * That data is later used in a binary search to find out
+     * if it is worth investigating a file for duplicates of a block.
+     * If one of the lines in the block does not occur in the other file
+     * at all, we can skip that file quickly.
+     */
+    private void fillSortedRelevantChecksums()
+    {
+        for (int i = 0; i < mLineChecksums.length; i++) {
+            int count = 0;
+            final long[] checksums = mLineChecksums[i];
+            final long[] relevant = new long[checksums.length];
+            for (int j = 0; j < checksums.length; j++) {
+                final long checksum = checksums[j];
+                if (checksum != IGNORE) {
+                    relevant[count++] = checksum;
+                }
+            }
+            Arrays.sort(relevant, 0, count);
+            final long[] result = new long[count];
+            System.arraycopy(relevant, 0, result, 0, count);
+            mSortedRelevantChecksums[i] = result;
+        }
+    }
+
+    /**
+     * finds duplicate lines in mFiles,
+     * using a textsearch algorithm to find reoccurring
+     * patterns in the lineChecksums.
+     */
+    private void findDuplicates()
+    {
+        if (LOG.isDebugEnabled()) {
+            LOG.debug("Analysis phase");
+        }
+
+        // It's been a while since my CS degree, but I think this is
+        // somewhere near O(mMin * LOC^2).
+
+        // It may be possible to do this *much* smarter,
+        // but I don't have the Knuth bible at hand right now :-)
+
+        // OK, prepare for some nested loops... :-(
+
+        for (int i = 0; i < mFiles.length; i++) {
+
+            final String path = mFiles[i].getPath();
+
+            getMessageCollector().reset();
+            final MessageDispatcher dispatcher = getMessageDispatcher();
+            dispatcher.fireFileStarted(path);
+
+            mLoc += mLineChecksums[i].length;
+            for (int j = 0; j < i; j++) {
+                findDuplicatesInFiles(i, j);
+            }
+
+            fireErrors(path);
+            dispatcher.fireFileFinished(path);
+        }
+    }
+
+    /**
+     * Compare two files and search for duplicates.
+     * @param aI mLineChecksums index of the first file to compare
+     * @param aJ mLineChecksums index of the second file to compare
+     */
+    private void findDuplicatesInFiles(int aI, int aJ)
+    {
+        final int iFileLength = mLineChecksums[aI].length;
+
+        // build up some supporting data structures
+        final boolean[] iLineOccurInJ = new boolean[iFileLength];
+        for (int iLine = 0; iLine < iFileLength; iLine++) {
+            iLineOccurInJ[iLine] = (Arrays.binarySearch(
+                mSortedRelevantChecksums[aJ], mLineChecksums[aI][iLine]) >= 0);
+        }
+
+        // go through all the lines in iFile and check if the following
+        // mMin lines occur in jFile.
+        // Note "<=": a candidate of exactly mMin lines at the very
+        // end of the file must not be missed.
+        for (int iLine = 0; iLine <= iFileLength - mMin; iLine++) {
+
+            // fast exit if one of the lines does not occur in jFile at all
+            boolean fastExit = false;
+            for (int k = 0; k < mMin; k++) {
+                if (!iLineOccurInJ[iLine + k]) {
+                    fastExit = true;
+                    break;
+                }
+            }
+
+            if (fastExit) {
+                mCacheHits += 1;
+            }
+            else {
+                // all lines do occur -> brute force searching
+                mCacheMisses += 1;
+                iLine = findDuplicateFromLine(aI, aJ, iLine);
+            }
+        }
+    }
+
+    /**
+     * Find and report a duplicate of the code starting from line aILine
+     * in file aI in the file aJ
+     * @param aI index of file that contains the candidate code
+     * @param aJ index of file that is searched for a dup of the candidate
+     * @param aILine starting line (0-based) of the candidate in aI
+     * @return the next line in file i where
+     * starting to search will make sense
+     */
+    private int findDuplicateFromLine(int aI, int aJ, int aILine)
+    {
+        // Using something more advanced like Boyer-Moore might be a
+        // good idea...
+
+        final int iFileLength = mLineChecksums[aI].length;
+        final int jFileLength = mLineChecksums[aJ].length;
+
+        // "<=" so duplicates reaching the very end of jFile are found
+        for (int jLine = 0; jLine <= jFileLength - mMin; jLine++) {
+            int equivalent = 0;
+            while (aILine + equivalent < iFileLength
+                    && jLine + equivalent < jFileLength
+                    && mLineChecksums[aI][aILine + equivalent] != IGNORE
+                    && mLineChecksums[aI][aILine + equivalent]
+                        == mLineChecksums[aJ][jLine + equivalent])
+            {
+                equivalent += 1;
+            }
+            if ((aI != aJ || aILine != jLine) && equivalent >= mMin) {
+                reportDuplicate(
+                    equivalent, aILine, mFiles[aJ], jLine);
+                aILine += equivalent; // skip to end of equivalent section
+            }
+        }
+        return aILine;
+    }
+
+    /**
+     * Dumps out a duplicate report.
+     * @param aEquivalent number of equivalent lines
+     * @param aILine location of duplicate code (0-based)
+     * within file that is currently checked
+     * @param aJFile the other file that contains the duplicate
+     * @param aJLine location of duplicate code (0-based) within aJFile
+     */
+    private void reportDuplicate(
+        int aEquivalent, int aILine, File aJFile, int aJLine)
+    {
+        final Integer dupLines = new Integer(aEquivalent);
+        // the internal arrays are 0-based, the reported line numbers
+        // are 1-based as everywhere else in Checkstyle
+        final Integer startLine = new Integer(aJLine + 1);
+        log(aILine + 1, "duplicates.lines",
+            new Object[]{dupLines, aJFile, startLine});
+        mDuplicates += 1;
+    }
+
+}
diff --git a/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/messages.properties b/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/messages.properties
new file mode 100644
index 000000000..1fbb4e210
--- /dev/null
+++ b/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/messages.properties
@@ -0,0 +1 @@
+duplicates.lines=Found duplicate of {0} lines in {1}, starting from line {2}
\ No newline at end of file
diff --git a/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/package.html b/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/package.html
new file mode 100644
index 000000000..c4b0deca7
--- /dev/null
+++ b/src/checkstyle/com/puppycrawl/tools/checkstyle/checks/duplicates/package.html
@@ -0,0 +1,38 @@
+
+
+Checks that search for duplicate code.
+
+There are many trade-offs when writing a duplicate code detection tool.
+Some of the conflicting goals are:
+
+- Fast
+- Low memory usage
+- Avoid false alarms
+- Support multiple/arbitrary languages
+- Support Fuzzy matches (comments, whitespace, linebreaks, variable renaming, etc.)
+
+
+
+StrictDuplicateCodeCheck is fast enough to facilitate checking very large code
+bases in acceptable time (minutes). It consumes very little memory, false
+alarms are not impossible but a really rare case. While it supports multiple
+languages it does not support fuzzy matches (that's why it's called Strict).
+
+
+Note that there are brilliant commercial implementations of duplicate code
+detection tools. One that is particularly noteworthy is
+Simian
+from RedHill Consulting, Inc.
+
+
+Simian has managed to find a very good balance of the above tradeoffs.
+It is superior to the checks in this package in many respects.
+Simian is reasonably priced (free for noncommercial projects)
+and includes a Checkstyle plugin.
+
+We encourage all users of Checkstyle to evaluate Simian as an
+alternative to the Checks we offer in our distribution.
+
+
+
+
\ No newline at end of file