From 37df9e83bf6f4596a950ea41859af30538c89d46 Mon Sep 17 00:00:00 2001 From: Simon Suchomel Date: Tue, 14 Aug 2012 19:14:11 +0200 Subject: [PATCH] Uterni psani --- algorithm.sty | 100 ++++++++++++++++++ algorithmic.sty | 232 +++++++++++++++++++++++++++++++++++++++++ paper.bib | 21 ++++ paper.tex | 3 + simon-searchengine.tex | 137 +++++++++++++++++++++--- 5 files changed, 480 insertions(+), 13 deletions(-) create mode 100644 algorithm.sty create mode 100644 algorithmic.sty diff --git a/algorithm.sty b/algorithm.sty new file mode 100644 index 0000000..e999a44 --- /dev/null +++ b/algorithm.sty @@ -0,0 +1,100 @@ +%% +%% This is file `algorithm.sty', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% algorithms.dtx (with options: `algorithm') +%% This is a generated file. +%% +%% Copyright (C) 1994-2004 Peter Williams +%% Copyright (C) 2005-2009 Rog^^e9rio Brito +%% +%% This document file is free software; you can redistribute it and/or +%% modify it under the terms of the GNU Lesser General Public License as +%% published by the Free Software Foundation; either version 2 of the +%% License, or (at your option) any later version. +%% +%% This document file is distributed in the hope that it will be useful, but +%% WITHOUT ANY WARRANTY; without even the implied warranty of +%% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser +%% General Public License for more details. +%% +%% You should have received a copy of the GNU Lesser General Public License +%% along with this document file; if not, write to the Free Software +%% Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +%% USA. 
+%% +\NeedsTeXFormat{LaTeX2e}[1999/12/01] +\ProvidesPackage{algorithm} + [2009/08/24 v0.1 Document Style `algorithm' - floating environment] +\RequirePackage{float} +\RequirePackage{ifthen} +\newcommand{\ALG@within}{nothing} +\newboolean{ALG@within} +\setboolean{ALG@within}{false} +\newcommand{\ALG@floatstyle}{ruled} +\newcommand{\ALG@name}{Algorithm} +\newcommand{\listalgorithmname}{List of \ALG@name s} +% Declare Options: +% * first: appearance +\DeclareOption{plain}{ + \renewcommand{\ALG@floatstyle}{plain} +} +\DeclareOption{ruled}{ + \renewcommand{\ALG@floatstyle}{ruled} +} +\DeclareOption{boxed}{ + \renewcommand{\ALG@floatstyle}{boxed} +} +% * then: numbering convention +\DeclareOption{part}{ + \renewcommand{\ALG@within}{part} + \setboolean{ALG@within}{true} +} +\DeclareOption{chapter}{ + \renewcommand{\ALG@within}{chapter} + \setboolean{ALG@within}{true} +} +\DeclareOption{section}{ + \renewcommand{\ALG@within}{section} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsection}{ + \renewcommand{\ALG@within}{subsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{subsubsection}{ + \renewcommand{\ALG@within}{subsubsection} + \setboolean{ALG@within}{true} +} +\DeclareOption{nothing}{ + \renewcommand{\ALG@within}{nothing} + \setboolean{ALG@within}{true} +} +\DeclareOption*{\edef\ALG@name{\CurrentOption}} +% ALGORITHM +% +\ProcessOptions +\floatstyle{\ALG@floatstyle} +\ifthenelse{\boolean{ALG@within}}{ + \ifthenelse{\equal{\ALG@within}{part}} + {\newfloat{algorithm}{htbp}{loa}[part]}{} + \ifthenelse{\equal{\ALG@within}{chapter}} + {\newfloat{algorithm}{htbp}{loa}[chapter]}{} + \ifthenelse{\equal{\ALG@within}{section}} + {\newfloat{algorithm}{htbp}{loa}[section]}{} + \ifthenelse{\equal{\ALG@within}{subsection}} + {\newfloat{algorithm}{htbp}{loa}[subsection]}{} + \ifthenelse{\equal{\ALG@within}{subsubsection}} + {\newfloat{algorithm}{htbp}{loa}[subsubsection]}{} + \ifthenelse{\equal{\ALG@within}{nothing}} + {\newfloat{algorithm}{htbp}{loa}}{} +}{ + 
\newfloat{algorithm}{htbp}{loa} +} +\floatname{algorithm}{\ALG@name} +\newcommand{\listofalgorithms}{\listof{algorithm}{\listalgorithmname}} +\endinput +%% +%% End of file `algorithm.sty'. diff --git a/algorithmic.sty b/algorithmic.sty new file mode 100644 index 0000000..1e6e9b6 --- /dev/null +++ b/algorithmic.sty @@ -0,0 +1,232 @@ +%% +%% This is file `algorithmic.sty', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% algorithms.dtx (with options: `algorithmic') +%% This is a generated file. +%% +%% Copyright (C) 1994-2004 Peter Williams +%% Copyright (C) 2005-2009 Rog^^e9rio Brito +%% +%% This document file is free software; you can redistribute it and/or +%% modify it under the terms of the GNU Lesser General Public License as +%% published by the Free Software Foundation; either version 2 of the +%% License, or (at your option) any later version. +%% +%% This document file is distributed in the hope that it will be useful, but +%% WITHOUT ANY WARRANTY; without even the implied warranty of +%% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser +%% General Public License for more details. +%% +%% You should have received a copy of the GNU Lesser General Public License +%% along with this document file; if not, write to the Free Software +%% Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +%% USA. 
+%% +\NeedsTeXFormat{LaTeX2e}[1999/12/01] +\ProvidesPackage{algorithmic} + [2009/08/24 v0.1 Document Style `algorithmic'] +% The algorithmic.sty package: +\RequirePackage{ifthen} +\RequirePackage{keyval} +\newboolean{ALC@noend} +\setboolean{ALC@noend}{false} +\newcounter{ALC@unique} % new counter to make lines numbers be internally +\setcounter{ALC@unique}{0} % different in different algorithms +\newcounter{ALC@line} % counter for current line +\newcounter{ALC@rem} % counter for lines not printed +\newcounter{ALC@depth} +\newlength{\ALC@tlm} +% +\DeclareOption{noend}{\setboolean{ALC@noend}{true}} +% +\ProcessOptions +% +% For keyval-style options +\def\algsetup{\setkeys{ALG}} +% +% For indentation of algorithms +\newlength{\algorithmicindent} +\setlength{\algorithmicindent}{0pt} +\define@key{ALG}{indent}{\setlength{\algorithmicindent}{#1}} +\ifthenelse{\lengthtest{\algorithmicindent=0pt}}% + {\setlength{\algorithmicindent}{1em}}{} +% +% For line numbers' delimiters +\newcommand{\ALC@linenodelimiter}{:} +\define@key{ALG}{linenodelimiter}{\renewcommand{\ALC@linenodelimiter}{#1}} +% +% For line numbers' size +\newcommand{\ALC@linenosize}{\footnotesize} +\define@key{ALG}{linenosize}{\renewcommand{\ALC@linenosize}{#1}} +% +% ALGORITHMIC +\newcommand{\algorithmicrequire}{\textbf{Require:}} +\newcommand{\algorithmicensure}{\textbf{Ensure:}} +\newcommand{\algorithmiccomment}[1]{\{#1\}} +\newcommand{\algorithmicend}{\textbf{end}} +\newcommand{\algorithmicif}{\textbf{if}} +\newcommand{\algorithmicthen}{\textbf{then}} +\newcommand{\algorithmicelse}{\textbf{else}} +\newcommand{\algorithmicelsif}{\algorithmicelse\ \algorithmicif} +\newcommand{\algorithmicendif}{\algorithmicend\ \algorithmicif} +\newcommand{\algorithmicfor}{\textbf{for}} +\newcommand{\algorithmicforall}{\textbf{for all}} +\newcommand{\algorithmicdo}{\textbf{do}} +\newcommand{\algorithmicendfor}{\algorithmicend\ \algorithmicfor} +\newcommand{\algorithmicwhile}{\textbf{while}} 
+\newcommand{\algorithmicendwhile}{\algorithmicend\ \algorithmicwhile} +\newcommand{\algorithmicloop}{\textbf{loop}} +\newcommand{\algorithmicendloop}{\algorithmicend\ \algorithmicloop} +\newcommand{\algorithmicrepeat}{\textbf{repeat}} +\newcommand{\algorithmicuntil}{\textbf{until}} +\newcommand{\algorithmicprint}{\textbf{print}} +\newcommand{\algorithmicreturn}{\textbf{return}} +\newcommand{\algorithmicand}{\textbf{and}} +\newcommand{\algorithmicor}{\textbf{or}} +\newcommand{\algorithmicxor}{\textbf{xor}} +\newcommand{\algorithmicnot}{\textbf{not}} +\newcommand{\algorithmicto}{\textbf{to}} +\newcommand{\algorithmicinputs}{\textbf{inputs}} +\newcommand{\algorithmicoutputs}{\textbf{outputs}} +\newcommand{\algorithmicglobals}{\textbf{globals}} +\newcommand{\algorithmicbody}{\textbf{do}} +\newcommand{\algorithmictrue}{\textbf{true}} +\newcommand{\algorithmicfalse}{\textbf{false}} +\def\ALC@item[#1]{% +\if@noparitem \@donoparitem + \else \if@inlabel \indent \par \fi + \ifhmode \unskip\unskip \par \fi + \if@newlist \if@nobreak \@nbitem \else + \addpenalty\@beginparpenalty + \addvspace\@topsep \addvspace{-\parskip}\fi + \else \addpenalty\@itempenalty \addvspace\itemsep + \fi + \global\@inlabeltrue +\fi +\everypar{\global\@minipagefalse\global\@newlistfalse + \if@inlabel\global\@inlabelfalse \hskip -\parindent \box\@labels + \penalty\z@ \fi + \everypar{}}\global\@nobreakfalse +\if@noitemarg \@noitemargfalse \if@nmbrlist \refstepcounter{\@listctr}\fi \fi +\sbox\@tempboxa{\makelabel{#1}}% +\global\setbox\@labels + \hbox{\unhbox\@labels \hskip \itemindent + \hskip -\labelwidth \hskip -\ALC@tlm + \ifdim \wd\@tempboxa >\labelwidth + \box\@tempboxa + \else \hbox to\labelwidth {\unhbox\@tempboxa}\fi + \hskip \ALC@tlm}\ignorespaces} +% +\newenvironment{algorithmic}[1][0]{ +\setcounter{ALC@depth}{\@listdepth}% +\let\@listdepth\c@ALC@depth% +\let\@item\ALC@item% + \newcommand{\ALC@lno}{% +\ifthenelse{\equal{\arabic{ALC@rem}}{0}} +{{\ALC@linenosize 
\arabic{ALC@line}\ALC@linenodelimiter}}{}% +} +\let\@listii\@listi +\let\@listiii\@listi +\let\@listiv\@listi +\let\@listv\@listi +\let\@listvi\@listi +\let\@listvii\@listi + \newenvironment{ALC@g}{ + \begin{list}{\ALC@lno}{ \itemsep\z@ \itemindent\z@ + \listparindent\z@ \rightmargin\z@ + \topsep\z@ \partopsep\z@ \parskip\z@\parsep\z@ + \leftmargin \algorithmicindent%1em + \addtolength{\ALC@tlm}{\leftmargin} + } + } + {\end{list}} + \newcommand{\ALC@it}{% + \stepcounter{ALC@rem}% + \ifthenelse{\equal{\arabic{ALC@rem}}{#1}}{\setcounter{ALC@rem}{0}}{}% + \stepcounter{ALC@line}% + \refstepcounter{ALC@unique}% + \item\def\@currentlabel{\theALC@line}% + } + \newcommand{\ALC@com}[1]{\ifthenelse{\equal{##1}{default}}% +{}{\ \algorithmiccomment{##1}}} + \newcommand{\REQUIRE}{\item[\algorithmicrequire]} + \newcommand{\ENSURE}{\item[\algorithmicensure]} + \newcommand{\PRINT}{\ALC@it\algorithmicprint{} \ } + \newcommand{\RETURN}{\ALC@it\algorithmicreturn{} \ } + \newcommand{\TRUE}{\algorithmictrue{}} + \newcommand{\FALSE}{\algorithmicfalse{}} + \newcommand{\AND}{\algorithmicand{} } + \newcommand{\OR}{\algorithmicor{} } + \newcommand{\XOR}{\algorithmicxor{} } + \newcommand{\NOT}{\algorithmicnot{} } + \newcommand{\TO}{\algorithmicto{} } + \newcommand{\STATE}{\ALC@it} + \newcommand{\STMT}{\ALC@it} + \newcommand{\COMMENT}[1]{\algorithmiccomment{##1}} + \newenvironment{ALC@inputs}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@outputs}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@globals}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@body}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@if}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@for}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@whl}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@loop}{\begin{ALC@g}}{\end{ALC@g}} + \newenvironment{ALC@rpt}{\begin{ALC@g}}{\end{ALC@g}} + \renewcommand{\\}{\@centercr} + \newcommand{\INPUTS}[1][default]{\ALC@it\algorithmicinputs\ \ALC@com{##1}\begin{ALC@inputs}} + 
\newcommand{\ENDINPUTS}{\end{ALC@inputs}} + \newcommand{\OUTPUTS}[1][default]{\ALC@it\algorithmicoutputs\ \ALC@com{##1}\begin{ALC@outputs}} + \newcommand{\ENDOUTPUTS}{\end{ALC@outputs}} + \newcommand{\GLOBALS}{\ALC@it\algorithmicglobals\ } + \newcommand{\BODY}[1][default]{\ALC@it\algorithmicbody\ \ALC@com{##1}\begin{ALC@body}} + \newcommand{\ENDBODY}{\end{ALC@body}} + \newcommand{\IF}[2][default]{\ALC@it\algorithmicif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\ELSE}[1][default]{\end{ALC@if}\ALC@it\algorithmicelse% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\ELSIF}[2][default]% +{\end{ALC@if}\ALC@it\algorithmicelsif\ ##2\ \algorithmicthen% +\ALC@com{##1}\begin{ALC@if}} + \newcommand{\FOR}[2][default]{\ALC@it\algorithmicfor\ ##2\ \algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\FORALL}[2][default]{\ALC@it\algorithmicforall\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@for}} + \newcommand{\WHILE}[2][default]{\ALC@it\algorithmicwhile\ ##2\ % +\algorithmicdo% +\ALC@com{##1}\begin{ALC@whl}} + \newcommand{\LOOP}[1][default]{\ALC@it\algorithmicloop% +\ALC@com{##1}\begin{ALC@loop}} + \newcommand{\REPEAT}[1][default]{\ALC@it\algorithmicrepeat% +\ALC@com{##1}\begin{ALC@rpt}} + \newcommand{\UNTIL}[1]{\end{ALC@rpt}\ALC@it\algorithmicuntil\ ##1} + \ifthenelse{\boolean{ALC@noend}}{ + \newcommand{\ENDIF}{\end{ALC@if}} + \newcommand{\ENDFOR}{\end{ALC@for}} + \newcommand{\ENDWHILE}{\end{ALC@whl}} + \newcommand{\ENDLOOP}{\end{ALC@loop}} + }{ + \newcommand{\ENDIF}{\end{ALC@if}\ALC@it\algorithmicendif} + \newcommand{\ENDFOR}{\end{ALC@for}\ALC@it\algorithmicendfor} + \newcommand{\ENDWHILE}{\end{ALC@whl}\ALC@it\algorithmicendwhile} + \newcommand{\ENDLOOP}{\end{ALC@loop}\ALC@it\algorithmicendloop} + } + \renewcommand{\@toodeep}{} + \begin{list}{\ALC@lno}{\setcounter{ALC@rem}{0}\setcounter{ALC@line}{0}% + \itemsep\z@ \itemindent\z@ \listparindent\z@% + \partopsep\z@ \parskip\z@ \parsep\z@% + \labelsep 0.5em \topsep 0.2em% 
+\ifthenelse{\equal{#1}{0}} + {\labelwidth 0.5em } + {\labelwidth 1.2em } +\leftmargin\labelwidth \addtolength{\leftmargin}{\labelsep} + \ALC@tlm\labelsep + } +} +{\end{list}} +\endinput +%% +%% End of file `algorithmic.sty'. diff --git a/paper.bib b/paper.bib index 97666d3..4dd87c6 100755 --- a/paper.bib +++ b/paper.bib @@ -27,6 +27,27 @@ year = "2012", } +@book{ManningRaghavanSchuetze08, + abstract = {Class-tested and coherent, this textbook teaches classical and web information retrieval, including web search and the related areas of text classification and text clustering from basic concepts. It gives an up-to-date treatment of all aspects of the design and implementation of systems for gathering, indexing, and searching documents; methods for evaluating systems; and an introduction to the use of machine learning methods on text collections. All the important ideas are explained using examples and figures, making it perfect for introductory courses in information retrieval for advanced undergraduates and graduate students in computer science. Based on feedback from extensive classroom experience, the book has been carefully structured in order to make teaching more natural and effective.}, + added-at = {2012-05-30T10:50:27.000+0200}, + address = {Cambridge, UK}, + author = {Manning, Christopher D. 
and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich}, + biburl = {http://www.bibsonomy.org/bibtex/28516d94c1f7aa1e391ddd3ace4caa23b/flint63}, + file = {Cambridge University Press Product Page:http\://www.cambridge.org/9780521865715:URL;Amazon Search inside:http\://www.amazon.de/gp/reader/0521865719/:URL;Google Books:http\://books.google.de/books?isbn=978-0-521-86571-5:URL}, + PAGES = {118-120}, + groups = {public}, + interhash = {b6954037b1d444f4afe4cad883b4d80c}, + intrahash = {8516d94c1f7aa1e391ddd3ace4caa23b}, + isbn = {978-0-521-86571-5}, + keywords = {v1205 book ai information retrieval language processing search xml web}, + publisher = {Cambridge University Press}, + timestamp = {2012-05-30T10:50:27.000+0200}, + title = {Introduction to Information Retrieval}, + username = {flint63}, + year = 2008 +} + + @INPROCEEDINGS{Knight, author = {Allan Knight and Kevin Almeroth and Bruce Bimber}, title = {An Automated System for Plagiarism Detection Using the Internet}, diff --git a/paper.tex b/paper.tex index 92e4a39..9a90757 100755 --- a/paper.tex +++ b/paper.tex @@ -4,6 +4,9 @@ \usepackage[utf8]{inputenc} \usepackage{times} \usepackage{graphicx} +\usepackage{algorithm} +\usepackage{algorithmic} +\usepackage{amssymb} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{document} diff --git a/simon-searchengine.tex b/simon-searchengine.tex index e50f3ef..22f0bff 100755 --- a/simon-searchengine.tex +++ b/simon-searchengine.tex @@ -12,10 +12,10 @@ In the PAN 12 test corpus, there were 32 text documents all contained at least o The documents were approximately 30 KB of size, the smallest were 18 KB and the largest were 44 KB. We have used methods which are applicable in general to any type of text input with no apriori information about the input document. -All queries from a given document were firstly preconstructed and they were afterwards sequentially executed -according to their priority. 
After each executed query the intermediate results of plagiarism -were calculated using the multi feature document comparison techniques described in section~\ref{yenya}. -This process leads to continuing lowering the number of executed queries, for more details see the following passages. +%All queries from a given document were firstly preconstructed and they were afterwards sequentially executed +%according to their priority. After each executed query the intermediate results of plagiarism +%were calculated using the multi feature document comparison techniques described in section~\ref{yenya}. +%This process leads to continuing lowering the number of executed queries, for more details see the following passages. We can divide constructed queries into three main groups according to their origin: @@ -32,11 +32,12 @@ These types of words are usually refered as the stop words. Thus we firstly filter out stop words according to general English stop words list. Secondly we used reference corpus word list as an additional filter. We used words in their simple verbatim occurrence allied to a -statistical estimate of likelihood. As a result of this a word to become a key +statistical estimate of likelihood. As a result of this, a word to become a key must not exists in the stop list and must occur in the suspicious document relatively much more frequently than in the reference corpus. We used a corpus created from common English web sources, which contains more then 4 billion tokens~\cite{ententen}. -The statistical measure TF-IDF was applied in order to decide whether the processed word shall become a keyword. +The statistical measure TF-IDF~\cite{ManningRaghavanSchuetze08} +was applied in order to decide whether the processed word shall become a keyword. The TF-IDF combines term frequency (TF) and inverse document frequency (IDF) to produce composite weight of a given term. 
Taking into account the average length of the suspicious document and minimizing the number of used queries we set TF-IDF weight threshold to 0.02, which resulted in about 10 to 20 terms identified as keywords per document. @@ -86,7 +87,7 @@ The values of window size and span of shifting was set empirically during traini largest change of AWFC compared to AWFC of their neighbouring previous window were selected as candidate for query extraction. -The query from the representative sentence were composed by 6 words from the sentence beginning, also omitting stop words. +The query from the representative sentence was composed by 6 words from the sentence beginning, also omitting stop words. We chose 6 words to cover higher portion of selected sentence and not to exceed the phrasal ChatNoir query limit. This type of query is actually looking for documents containing the very same or at least the very similar sentence, @@ -98,19 +99,129 @@ expected even better results from those queries. The intrinsic plagiarism based queries are document positionable, meaning that we can store their position within the document. It is due to the fact, that -those queries are created from more or less subsequent words. As a result of that they are also executed conditionally, for more +those queries are created from more or less subsequent words. As a result, they are also executed conditionally, for more information see section~\ref{process}. The position within the document were set to the start position of the representative sentence. They characterize only a specific portion of the suspicious document on the contrary to keyword based queries. \subsection{Headers based queries} -The last type of queries are queries constructed from local headers of document. -The header usually characterize briefly and accurately following section, thus it -can be used as a search query. We considered headers only from 2 to 6 words in length. 
+The last type of queries were constructed from local headers of the document. +A header usually characterizes briefly and accurately the following section, thus it +can be used as a search query. In real scenarios the headers should +be located using some document metadata, in that way we can distinguish for example more header levels. +In the case of PAN 12 competition we have detected headers according to several +basic rules. Those basic rules applied to most headers in the given test corpus text files. +We consider as a header any line which has an empty line above and below the actual line +and is from 2 to 6 words in length also omitting stop words. +Note that the length of header based queries is variable, thus they can be both specific +and general according to their length. They are also document positionable. +We calculated their position as start position of the following text. -\subsection{Input document processing}~\label{process} +\subsection{Combining and executing queries}\label{process} -\subsection{Queries comparison} +After each executed query the intermediate results of plagiarism +were calculated using the multi feature document comparison techniques described in section~\ref{yenya}. +This process leads to continuing lowering the number of executed queries, for more details see the following passages. + +All queries from a given document were firstly preconstructed and they were afterwards sequentially executed +according to their priority. +First all keyword based queries, then +intrinsic plagiarism based and finally headers based, whilst we could omit some of the positionable queries during the process, +which is described further in this section. +The query processing is outlined in Algorithm~\ref{alg:queries}, where for simplicity the snippet calculation and the web source download are omitted. 
+After executing a query the intermediate results were always +calculated as follows: For every query result in the result page, based +on snippet to input suspicious document similarities, we decided whether to actually +download the resulting URL or not. Since the snippet contains portions of the Web document for each +given keyword, we calculated pre-similarity as apportionment of every word match between the snippet and the suspicious document. +If there was at least an 80\% match, we downloaded the web source for thorough investigation. +For each downloaded web document we calculated similarities with the input suspicious document +in the same way as described in section~\ref{yenya}. All similarities have been stored in the form of intervals from within +the input suspicious document. In other words, for every similar part between the downloaded +document and the suspicious document we stored the beginning position and the ending position of that part in +the suspicious document. + +As a result of that, we can, while processing further queries which haven't been executed yet, omit those +whose document position intersects with any of the already found similarity intervals. This procedure +leads us to lowering the total number of executed queries. +All downloaded documents, which have at least one similarity interval, are declared +as possible sources of the plagiarism. + + +\renewcommand{\algorithmicrequire}{\textbf{Input:}} +\renewcommand{\algorithmicensure}{\textbf{Output:}} + + +\algsetup{indent=2em} +\begin{algorithm}[h!] 
+\caption{Queries execution}\label{alg:queries} +\begin{algorithmic}[1] +\REQUIRE suspicious document $d$, queries $Q_{KW}, Q_{Intrinsic}, Q_{Headers}$ +%keyword, intrinsic and headers based queries $Q_{KW}, Q_{Intrinsic}, Q_{Headers}$, + +\ENSURE plagiarism source candidate web documents $W_{plag}$ +\medskip + +\STATE $I \leftarrow \emptyset; I \in \mathbb{N}^2$ +\STATE $W \leftarrow \emptyset; W_{plag} \leftarrow \emptyset$ + +\FORALL{$q \in (Q_{KW} \cup Q_{Intrinsic} \cup Q_{Headers}) $} + \IF{$\mathit{position}(q)$ does not intersect with any interval $ \vec{i} \in I$} + \STATE $W \leftarrow \mathit{execute}(q)$ + \FORALL{$w \in W$} + \STATE $I_{sub} \leftarrow get\_similarities(w, d)$ %\COMMENT{output is set of intervals similarities $[d_{start},d_{end}]$} + \STATE $I \leftarrow I \cup I_{sub}$ + \IF{$I_{sub} \neq \emptyset$} + \STATE $W_{plag} \leftarrow W_{plag} \cup \{w\}$ + \ENDIF + \ENDFOR + \ENDIF +\ENDFOR +\RETURN $W_{plag}$ + +\end{algorithmic} +\end{algorithm} + +\subsection{Queries comparison} +During the test phase 133 keyword based queries, 165 intrinsic plagiarism +based queries and 331 headers based queries were extracted in total. Table~\ref{querycount} shows the arithmetic mean +of the query count per document. We can see that nearly half of the prepared queries were +omitted due to the fact that a similarity covering their document position had already been found. 
+ +\begin{center} +\begin{table}[h] +\begin{center} +\caption{\footnotesize The arithmetic mean of the query count and the omitted query count per document} +\vspace{0.3 cm} +\begin{tabular}{|c|c|c|} +\hline +\label{querycount} +{\bf Query type} & {\bf Extracted}& {\bf Omitted} \\ \hline \hline +KW & 4.16 & N/A \\ \hline +Intrinsic & 5.16 & 2.35 \\ \hline +Headers & 10.34 & 4.75 \\ \hline +\end{tabular}\\ +\end{center} +\end{table} +\end{center} + + +\begin{center} +\begin{table}[h] +\begin{center} +\caption{\footnotesize The total portion of similarities found, taking into account number of similarities regardless of interval sizes.} +\vspace{0.3 cm} +\begin{tabular}{|c|c|} +\hline +\label{querySuccess} +{\bf Query type} & {\bf Similarities portion } \\ \hline \hline +KW & 72.5 \% \\ \hline +Intrinsic & 24.3 \% \\ \hline +Headers & 10.34 \% \\ \hline +\end{tabular}\\ +\end{center} +\end{table} +\end{center} -- 2.43.5