% Journal article (KIPS Transactions on Computer and Communication Systems, 2019):
% HDFS-based distributed processing system for malware feature extraction.
@article{M37CCBA33,
  author   = {Lee, Hyunjong and Euh, Seongyul and Hwang, Doosung},
  title    = {Distributed Processing System Design and Implementation for Feature Extraction from Large-Scale Malicious Code},
  journal  = {KIPS Transactions on Computer and Communication Systems},
  year     = {2019},
  issn     = {2287-5891},
  doi      = {10.3745/KTCCS.2019.8.2.35},
  keywords = {Distributed Processing System, Malware Detection, Feature Extraction, Machine Learning},
  abstract = {Traditional Malware Detection is susceptible for detecting malware which is modified by polymorphism or obfuscation technology. By learning patterns that are embedded in malware code, machine learning algorithms can detect similar behaviors and replace the current detection methods. Data must collected continuously in order to learn malicious code patterns that change over time. However, the process of storing and processing a large amount of malware files is accompanied by high space and time complexity. In this paper, an HDFS-based distributed processing system is designed to reduce space complexity and accelerate feature extraction time. Using a distributed processing system, we extract two API features based on filtering basis, 2-gram feature and APICFG feature and the generalization performance of ensemble learning models is compared. In experiments, the time complexity of the feature extraction was improved about 3.75 times faster than the processing time of a single computer, and the space complexity was about 5 times more efficient. The 2-gram feature was the best when comparing the classification performance by feature, but the learning time was long due to high dimensionality.},
}