X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?a=blobdiff_plain;ds=sidebyside;f=pan13-poster%2Fposter.tex;h=bfd9762556c9faa8c86de36a7761b656d1fab521;hb=fa03e2ee0827e56f549a32c1782cde605b062e60;hp=ce9f7a324da40f830b3ffbf36fe21c3f284e6839;hpb=c091d54abf1aa848d425ee061f66004bfaab69c5;p=pan13-paper.git diff --git a/pan13-poster/poster.tex b/pan13-poster/poster.tex old mode 100644 new mode 100755 index ce9f7a3..bfd9762 --- a/pan13-poster/poster.tex +++ b/pan13-poster/poster.tex @@ -4,9 +4,15 @@ \usepackage{amsmath} \usepackage{amssymb} \usepackage{multicol} -\usepackage{bera} \usepackage[utf8]{inputenc} %\usepackage{fancybullets} +%\usepackage{floatflt} +%\usepackage{graphics} +\usepackage{fontspec} +\usepackage{xunicode} +\setmainfont[Mapping=tex-text]{DejaVu Sans} +\setsansfont[Mapping=tex-text]{DejaVu Sans} +\setmonofont[Mapping=tex-text]{DejaVu Sans Mono} \definecolor{BoxCol}{rgb}{0.9,0.9,1} % uncomment for light blue background to \section boxes @@ -18,7 +24,7 @@ \definecolor{ReallyEmph}{rgb}{0.7,0,0} \renewcommand{\titlesize}{\Huge} -\title{Distributed System \\ for Discovering Similar Documents} +\title{Diverse Queries and Feature Type Selection for Plagiarism Discovery} % Note: only give author names, not institute \author{Šimon Suchomel, Jan Kasprzak, and Michal Brandejs} @@ -45,10 +51,17 @@ } { \end{itemize} } -\conference{{\bf ICEIS 2008}, 12--16 June 2008, Barcelona, Spain} +\conference{{\bf CLEF 2013}, 23--27 September 2013, Valencia, Spain} \setlength{\figbotskip}{\smallskipamount} +\renewcommand{\SubSection}[2][?]{ + \vspace{0.5\secskip} + \refstepcounter{subsection} + {\bf \subsectionsize \textcolor{SectionCol}{\arabic{section}.\arabic{subsection}~#2}} + \par\vspace{0.375\secskip} +} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Begin of Document @@ -106,46 +119,216 @@ \vspace{-.02\textwidth} %%% Begin of Multicols-Enviroment +%\begin{abstract} +%{\sffamily\itshape +%Nějaký abstrakt. 
+%} +%\end{abstract} + + +\begin{multicols}{2}\setlength{\columnseprule}{0pt} +\section{Introduction} +% +A program for helping to deter real-world plagiarism needs to accomplish many tasks. +Original documents which served for the creation of plagiarism must be retrieved and also suspicious passages according to +input document must be highlighted. This poster presents the methodology used during the PAN2013 competition on uncovering plagiarism. + +The whole process is depicted in Figure~\ref{fig:process}. The source retrieval task is divided into +2 subtasks: Querying and Selecting, during which the software utilizes a given search engine. The retrieved +sources must be examined in detail in order to highlight as many plagiarism cases as possible. This process is depicted +as Text Alignment. Results of this process are called {\em detections}, i.e.~passages of {\em source document} and {\em suspicious document}, which are similar enough to each other, and can serve as a basis for further manual examination for possible plagiarism. +% +\vfill +\columnbreak +% +\begin{figure} + \centering + \includegraphics[width=0.8\textwidth]{img/source_retrieval_process.pdf} + \caption{Plagiarism discovery process.} + \label{fig:process} +\end{figure} +\end{multicols} \begin{multicols}{2} +%\rm +%%% Introduction +\section{Querying} +Querying means to effectively utilize a search engine in order to retrieve as many relevant +documents as possible with the minimum number of queries. +%We consider the resulting document relevant if it shares some of text characteristics with the suspicious document. +In the real world, queries as such represent an appreciable cost; therefore their quantity minimization should be one of the top priorities. 
+%\subsection{Types of Queries} +During the initial phase, three diverse types of queries were extracted from each suspicious document.\\ +\begin{minipage}{0.55\linewidth} +\subsection{Keywords Based Queries} +\begin{ytemize} +\item TF--IDF based automated keyword extraction; +\item 5 tokens long; +\item Deterministic; +\item Non-positional; +\item Non-phrasal. +\end{ytemize} +\end{minipage} +\begin{minipage}{0.45\linewidth} +\begin{figure}[h] + %\centering + \includegraphics[width=1\linewidth]{img/document_keywords.pdf} +\end{figure} +\end{minipage} +\begin{minipage}{0.55\linewidth} +\subsection{Intrinsic Plagiarism Based Queries} +\begin{ytemize} +\item Averaged Word Frequency Class based chunking~\cite{awfc}; +\item Random sentence selection from the chunk; +\item Non-deterministic; +\item Positional; +\item Phrasal. +\end{ytemize} +\end{minipage} +\begin{minipage}{0.45\linewidth} +\begin{figure}[h] + %\centering + \includegraphics[width=1\linewidth]{img/document_awfc.pdf} +\end{figure} +\end{minipage} +\begin{minipage}{0.55\linewidth} +\subsection{Paragraph Based Queries} +\begin{ytemize} +\item Longest sentences from miscellaneous paragraphs; +\item Deterministic; +\item Positional; +\item Phrasal. +\end{ytemize} +\end{minipage} +\begin{minipage}{0.45\linewidth} +\begin{figure}[h] + %\centering + \includegraphics[width=1\linewidth]{img/document_paragraphs.pdf} +\end{figure} +\end{minipage} -\rm -%%% Abstract -\begin{abstract} +\begin{figure}[h] + \centering + \includegraphics[width=0.8\linewidth]{img/queryprocess.pdf} + \caption{Stepwise query execution process.} +\end{figure} + +\section{Selecting} +Document snippets were used for deciding whether to download the document for the text alignment. +We used a 2-tuple measurement, which indicates how many neighbouring word pairs coexist in the snippet and in the suspicious document. +Performance of this measure is depicted in Figure~\ref{fig:snippet_graph}. 
+Having this measure, a threshold for the download decision needs to be set in order to maximize all discovered similarities +and minimize total downloads. +A profitable threshold is one that corresponds to the largest distance between those two curves. + +\begin{figure} + \centering + \includegraphics[width=0.8\textwidth]{img/snippets_graph.pdf} + \caption{Downloads and similarities performance.} + \label{fig:snippet_graph} +\end{figure} +% +% Yenyova cast +% +\section{Text Alignment} + +The system uses the same basic principles as in \cite{suchomel_kas_12}: + +\begin{ytemize} +\item{\cemph{common features} between source and suspicious documents} +\begin{ytemize} +\item{word 5-grams} +\item{stop-word 8-grams \cite{stamatatos2011plagiarism}} +\end{ytemize} +\item{\cemph{valid intervals} of characters covered by common features + ``densely enough''} +\item{\cemph{postprocessing}---remove overlapping detections, + join neighbouring detections} +\end{ytemize} + +\subsection{Alternative Features} + +\begin{ytemize} +\item{\cemph{contextual n-grams} \cite{torrejondetailed}} +\begin{ytemize} +\item{\cemph{The quick} brown \cemph{fox jumped} over the lazy dogs.} +\item{The \cemph{quick brown} fox \cemph{jumped over} the lazy dogs.} +\end{ytemize} +\item{plain word 4-grams} +\begin{ytemize} +\item{\cemph{The quick brown fox} jumped over the lazy dogs.} +\item{The \cemph{quick brown fox jumped} over the lazy dogs.} +\end{ytemize} +\end{ytemize} + +\begin{table} -{\sffamily\itshape +\begin{center} +\begin{tabular}{|l|r|r|r|r|} +\hline +\bf feature & \bf recall & \bf precision & \bf granularity & plagdet \\ +\hline +plain 5-grams & 0.6306 & 0.8484 & 1.0000 & \cemph{0.7235} \\ +contextual 4-grams & 0.6721 & \cemph{0.8282} & 1.0000 & \cemph{0.7421} \\ +plain 4-grams & \cemph{0.7556} & 0.7340 & 1.0000 & \cemph{0.7447} \\ +\hline +\end{tabular} +\end{center} -Nějaký abstrakt. 
+\caption{Comparison of contextual 4-grams and plain word 4-grams} +\end{table} -} -\end{abstract} +\subsection{Global Postprocessing} -%%% Introduction -\section{Šimonova část} - -\subsection{Kdovíco} +\begin{ytemize} +\item{Similar to PAN 2010 \cite{Kasprzak2010}} +\item{Overlapping detections removal} +\item{\cemph{Result:} improvement, but not as significant as in 2010} +\end{ytemize} -\section{Yenyova část} +% +% Spolecna cast +% \section{Conclusion} -Nějaký závěr +\subsection{Candidate retrieval} -%%% References +\begin{ytemize} +\item{Second best ratio of recall to the number of queries} +\item{Missing support for phrasal search in ChatNoir is a big stumbling block} +\end{ytemize} -%% Note: use of BibTeX als works!! +\subsection{Text alignment} -\bibliographystyle{plain} -\begin{thebibliography}{1} +\begin{ytemize} +\item{Significant improvement against PAN 2013} +\item{Word 4-grams are better than contextual 4-grams} +\item{We need a better ranking system than plagdet!} +\end{ytemize} -\bibitem{ISMU} -\cemph{Masaryk University Information System}\\ -{\tt http://is.muni.cz/}, contact: {\tt iscor@fi.muni.cz}. +%%% References -\bibitem{Theses} -\cemph{Czech National Archive of Graduate Theses}\\ -{\tt http://theses.cz/}, contact: {\tt theses@fi.muni.cz}. +%% Note: use of BibTeX als works!! -\end{thebibliography} +\bibliographystyle{plain} +\bibliography{pan13-notebook} +\nocite{awfc} + +%\begin{thebibliography}{1} +% +%\bibitem{ISMU} +%\cemph{Masaryk University Information System}\\ +%{\tt http://is.muni.cz/}, contact: {\tt iscor@fi.muni.cz}. +% +%\bibitem{Theses} +%\cemph{Czech National Archive of Graduate Theses}\\ +%{\tt http://theses.cz/}, contact: {\tt theses@fi.muni.cz}. 
+% +%\bibitem{AWFC} +%\cemph{Sven Meyer Zu Eissen and Benno Stein: Intrinsic Plagiarism Detection}\\ +%{\tt Proceedings of the European Conference on Information Retrieval (ECIR-06)}, {\tt 2006} +% +%\end{thebibliography} \smallskip \hrule height .1em @@ -153,14 +336,20 @@ Nějaký závěr % \sffamily -QR kód? +\hbox to \hsize{ + {\hsize=0.5\hsize\vbox{ \cemph{Contact information:}\\ - Šimon Suchomel {\tt suchomel@fi.muni.cz},\\ - Jan Kasprzak, {\tt kas@fi.muni.cz}. - + Šimon Suchomel {\tt suchomel@fi.muni.cz}\\ + Jan Kasprzak {\tt kas@fi.muni.cz}\\ + {\cemph{\tt http://www.fi.muni.cz/\~{}kas/pan13/}} +} + \hfill + {\hsize=0.4\hsize\vbox{ + \includegraphics[width=\hsize]{qrcode.png} +}}}} + \end{multicols} \end{document} -