Preface

Dear Delegates,

Welcome to ERTAI 2010. Hope you have a productive day with us during the Workshop.

As you may be aware, ERTAI is a unique attempt to encourage quality research in the area of AI in India. We started with two objectives: to attract more of our academia to Artificial Intelligence as their topic of research, and to help those pursuing research in AI-related topics enhance the quality of their research. Due to the shortage of adequately qualified and experienced researchers in AI, aspiring researchers and faculty members are often reluctant to pick an area like AI. And where they do pick such topics, the depth is often low. A blind application of techniques like genetic algorithms or a standard neural network is often expected to be seen as research. Similarly, dividing a problem into a number of modules and processes gets flagged as intelligent agents. Using a standard image pipeline with a neural network classifier for identifying numerals is another example. One can see many examples of such shallow usage if you scan our conferences. This is the backdrop for positioning ERTAI, as mentioned earlier. We hope you will benefit from the event and pass on the word to your colleagues and friends on your return.

In order to achieve these objectives, we have taken a number of steps as part of the workshop:
• The invited talks from experts in the field would give you a feel for some of the challenging areas of research in AI and outline some of those challenges. Due to time constraints, we have restricted the talks to just a few. We shall try to cover other areas on future occasions.

• The paper presentations are by those currently pursuing an AI problem for their research. These would serve as case studies of AI research for our delegates and as a review and feedback forum for the researchers.

• We have a panel of experienced AI researchers in the paper presentation sessions, who would offer suggestions in regard to the papers being presented. Other delegates would also benefit from this interaction in understanding common pitfalls and confusions.

• We have included a few articles and notes from researchers in the workshop proceedings. These are in three parts. The first part includes some papers identifying challenging areas and problems in some sub-areas of AI. This would help you quickly identify areas worth considering further. The second part includes a few articles on writing research papers, theses, etc., and tips and guidelines to make your PhD period more productive and enjoyable. As you know, the ability to write good research papers is a skill often lacking among our research scholars. And the third part lists a number of high-quality software tools for various AI areas like data mining, natural language processing, etc. These come with full development environments and/or fairly up-to-date algorithms in the relevant areas. Being open source, you can not only use them as you like, but also make modifications to them. We hope you find these resources useful.
That briefly outlines what we have in store for you in the workshop. This compilation also includes copies of all research papers selected for presentation.

The workshop was initiated by the Special Interest Group on Artificial Intelligence (SIG-AI) of the Computer Society of India, led by its Convenors, Dr S Ramani and Prof PVS Rao, and owes much to their constant and active support and encouragement. We have chosen the date of April 17 for the workshop, to coincide with the birthday of Prof R Narasimhan, who passed away in 2007. Prof Narasimhan was a pioneer of AI in India, and spearheaded a number of initiatives including the formation of CSI and the design of India's first digital computer. AI was perhaps the most passionate subject for him, with seminal contributions in pattern recognition and language behaviour.

The Advisory Committee members readily agreed to support the workshop and extended their support wherever required. We also thank the referees for their valuable comments on the papers, despite the very short time available.

We thank the management of CDAC Mumbai, Executive Director Mr. Zia Saquib and Director Mr. George Arakal, for allowing the workshop to be held at its premises in the beautiful Navi Mumbai campus. The organizing team, primarily the administrative team and the KBCS group of CDAC Mumbai, have tried their best to make the event useful and enjoyable. Thanks are also due to the Artificial Intelligence initiative at Hyderabad for financial support of the event.
Sasikumar M
Secretary, SIG-AI
ProMax: A Profit Maximizing Recommendation System for Market Baskets
Lydia Manikonda, Annu Tuli and Vikram Pudi
Center for Data Engineering, IIIT-H, Hyderabad, India
{{lydia, annu}@research, vikram}@iiit.ac.in
Abstract—Most data mining research has focused on developing algorithms to discover statistically significant patterns in large datasets. However, utilizing the resulting patterns in decision support is more of an art, and the utility of such patterns is often questioned. In this paper we formalize a technique that utilizes data mining concepts to recommend an optimal set of items to customers in a retail store based on the contents of their market baskets. The recommended set of items maximizes the expected profit of the store and is decided based on patterns learnt from past transactions. In addition to concepts of clustering and frequent itemsets, the proposed method also combines the idea of knapsack problems to decide on the items to recommend. We empirically compare our approach with existing methods on both real and synthetic datasets and show that our method yields better profits while being faster and simpler.

Keywords-profit mining; recommendations; clustering
I. INTRODUCTION
The data mining research literature is replete with several algorithms – most of which are elegant, efficient and effective. However, there are only a few formal studies concerning how data mining can actually be beneficial for more specific targets. A major obstacle in data mining application is the gap between statistically significant pattern extraction and value-based decision making [2]. It is often questioned how exactly one should make use of the patterns extracted through data mining algorithms for making effective business decisions, with the ultimate goal of yielding better profits for the business. Similarly, studies about the retail market [1] have received wide attention, although only a few of them have seriously dealt with data mining. Ke Wang et al. [2] first presented a profit mining approach to reduce this gap in 2002, and recent investigations have shown an increasing interest in how to make decisions by utilizing association rules.

In this paper we focus on the problem of recommending products to customers in retail stores such that the profit of the store is maximized. Market basket databases contain historical data on prior customer choices, where each customer has selected a subset of items, a market basket, out of a large but finite set. This data can be used to generate a dynamic recommendation of new items to a customer who is in the process of making item choices. Some retail outfits provide carts with displays that show product information and recommendations as the customer shops. Remote shopping systems allow customers to compose shopping lists through personal digital assistants (PDAs), with interactive recommendations of likely items. Internet
commercial sites often provide dynamic product choices as the customer adds items into the virtual shopping cart, or market basket. Internet sites also display a dynamically changing set of links to related sites depending on the browsing pattern during a surfing session. Faced with an enormous variety of options, customers surfing the web gravitate toward sites that offer information tailored to their personal preferences. All these activities are characterized by the progressive item choices being made by a customer, and the providers' desire to recommend items that are the most likely next choice of the customer [14].

From the market angle, two important criteria [5] should be taken into account during the process of mining profit: the items in retail shops should first meet the basic sale request, and second, should bring higher profits. Therefore, how to meet these two principles is the core problem of profit mining. The cross-selling effect of items [1] has been noticed by current retailers: the profit of an item is not only involved in the item itself, but is also influenced by its related items. Some items fail to produce high profit by themselves, but they might stimulate customers to buy other profitable items. Consequently, the cross-selling factor, which can be studied by the analysis of historical transactions, should be involved in the problem of item selection. Searching for such a relation of items to support cross-selling has become an important issue. Current approaches to study these relationships are based on association rules. However, association rules by themselves do not suggest how to maximize profit.

In this paper, we present an algorithm that combines simple ideas from clustering, recommendation systems, and the knapsack problem to recommend those items to customers that maximize the expected profit. We tested our algorithm on two popular datasets: one was generated by using the data generator from the IBM Almaden Quest research group [13] and the other was a retail market basket dataset available on the FIMI repository [15]. Our experiments show that our algorithm performs better than the competing algorithms in terms of obtaining better profits, while at the same time being faster and simpler.

This paper is organized as follows: In Section II, we introduce the related work on profit mining. In Section III, we describe the problem statement. In Section IV we discuss our algorithm. Detailed experimental results are presented in Section V. In Section VI, we draw conclusions and present future work.

II. RELATED WORK

Many novel and important methods were proposed to support profit mining. In 2002, the profit mining approach and
other related problems [2], [6] were presented by Ke Wang et al. The web page ranking algorithm HITS [8] was extended into the HAP algorithm [7] by Ke Wang et al. to solve the problem of item ranking with consideration of the influence of confidence and profit, but HAP still has several drawbacks [9]. In order to mine a subset of items with maximal profit while addressing those drawbacks, the maximal profit problem of item selection (MPIS) [9] was proposed by Raymond Wong et al. However, MPIS is too difficult to implement and solves an NP-hard problem even in the simplest situation. In other words, although the MPIS algorithm could find the best solution, its time cost is too expensive to be tolerated. By considering the cross-selling effect in order to solve the problem of Item Selection for Marketing (ISM), the Hill Climbing method [4] was recently proposed by Raymond Wong et al. Raymond Wong et al. [10] also adopted genetic algorithms to generate a local optimal profit approximating the optimal profit of the item selection problem.

The DualRank [12] algorithm uses a connected graph made out of the association rules. If the minimum support is increased, the out-degrees of the items decrease, which leads to the compression of the graph. In general, if the support of items is decreased, the number of association rules keeps increasing; the matrix calculations are affected, which in turn affects the profit-based item selection. Another problem with DualRank [12] is that it is not very efficient for sparse data sets, and it includes cumbersome calculations of eigenvalues and eigenvectors which become intractable for large transactional data sets. To overcome these limitations, in this paper we have developed a new algorithm for market basket datasets.

III. PROBLEM DEFINITION

In this paper we focus on the problem of recommending products to customers in retail stores such that the profit of the store is maximized. The following definition formally captures the problem statement:

Definition 1 (Profit Mining): Given a transactional dataset D = {t1, t2, ..., tm}, where each transaction ti contains a subset of items (itemset) from the set I = {i1, i2, ..., in}, each item ij having an associated profit pj, the problem is to select k additional items to recommend for each transaction, with the goal of making more sales and maximizing the total profit of the resulting transactions.

There are several challenges to this problem:

1) Model product purchase probabilities: We need to recommend products that are more likely to be purchased. It is therefore important to have a clear understanding of how to model the probability of purchase of each product.

2) Model product relationships: It is not sufficient to know the individual purchase probabilities of different products. We also need to identify relationships between items for cross-selling. The standard technique in recent approaches for this purpose has been to use association rules.

3) Model customers: Even high-confidence association rules may not apply to a particular customer, whereas some low-confidence rules may apply. Thus, it is imperative to model customers regarding which category they belong to and then study the rules within that
category. This is more likely to result in effective recommendations. The standard technique in the recommendation system community for this purpose is to cluster customers such that customers within each cluster share the same purchase patterns.

4) Balance purchase probability and profit: A pure association rule based approach will favor rules with high confidence so as to maximize the probability that the customer will purchase the recommended item. For example, the rule {Perfume} → {Lipstick} will be favored because of higher confidence compared to a rule {Perfume} → {Diamond}. In contrast, a pure profit-based approach will favor the latter rule hoping for higher profit. Neither necessarily maximizes the true profit. Indeed, items of high profit often also have low supports because fewer people buy expensive items.

5) Decide the number of products to recommend: An implicit constraint here is that if we recommend too many items then the customer will be overwhelmed by the choices and is likely to avoid choosing any item at all. On the other hand, if we recommend too few items, then we may miss a successful sale of a product that has not been brought to the attention of the customer. The correct number of products to recommend depends on the attention span of customers and would vary depending on the domain.

IV. THE PROMAX ALGORITHM

As discussed in the previous section, on the face of it, the problem of recommending the right products is replete with several challenges. It therefore seems daunting to design a simple yet effective algorithm for this task, so much so that designing an optimal algorithm that guarantees to maximize the expected profit seems out of reach. In this section we present our algorithm ProMax, a Profit Maximizing recommendation system for market baskets.

Our algorithm performs a clustering of the customer transactions as a preprocessing step. For this purpose, it uses the clustering algorithm in [11]. Next, at the time of recommendation, the algorithm is based on the following steps:

1) Identify the cluster C to which the current record belongs.
2) Calculate the expected profit of each item in the cluster C.
3) Sort the items based on expected profit and recommend the top k items, where k is a parameter to the algorithm.

A. Clustering Customer Transactions and Identification of the Current Cluster

Since the probability of earning more profit is directly proportional to the purchase probability of items, it is imperative to be able to accurately estimate the purchase probability of specific items by specific customers. A naive solution is to use the global support of items as an estimate of their probability. But, as customers differ in their purchase patterns, the global support of an item is an unreliable estimator of its likelihood of sale.
Therefore, a natural approach is to first cluster the transactions based on their purchase patterns and then use the support of items within a cluster as a more accurate estimate of their purchase probabilities. The clustering criterion we use is based on the notion of large items and was proposed in [11]. For a given collection of transactions and a minimum support, our aim is to find a clustering C such that cost(C) is minimum. cost(C) is calculated using the intra-cluster similarity and the inter-cluster similarity, which are in turn calculated using the notions of Small Items and Large Items respectively.

For a user-specified minimum support θ (0 < θ ≤ 1), an item is large in cluster Ci if its support in Ci is ≥ θ×|Ci|; otherwise, an item is small in Ci. Intuitively, large items are popular items in a cluster and thus contribute to similarity of items within a cluster. In contrast, small items contribute to dissimilarity in a cluster. Let Largei denote the set of large items in Ci, and Smalli denote the set of small items in Ci. Consider a clustering C = {C1, C2, …, Ck}. The cluster to which the current record r belongs depends on the cost, and can be defined mathematically as:
Cluster(r) = arg min_i [Cost(Ci)]    (1)
The cost of C to be minimized depends on two components: the intra-cluster cost and the inter-cluster cost.

Intra-cluster cost: This component is charged for the intra-cluster dissimilarity, measured by the total number of small items:

Intra(C) = Σ_{i=1..k} |Small_i|    (2)
This component will restrain creating loosely bound clusters that have too many small items. Inter-cluster cost: This component is charged for the inter-cluster similarity. Since large items contribute to similarity in a cluster, each cluster should have as little overlapping of large items as possible. This overlapping is measured as: k
Inter (C ) = 1 L arg ei − i =1
1
k
i =1
L arg ei
(3)
In words, Inter(C) measures the duplication of large items in different clusters. This component will restrain creating similar clusters. To put the two together, one can specify weights for their relative importance. The cost function of the clustering C is then defined as:

Cost(C) = w × Intra(C) + Inter(C)    (4)
A weight w > 1 gives more emphasis to the intra-cluster similarity, and a weight w < 1 gives more emphasis to the inter-cluster dissimilarity. By default w = 1.
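To make the cost model concrete, the following is a minimal C++ sketch of how Cost(C) in Eqs. (2)-(4) could be computed for a candidate clustering. The paper's own implementation is not shown here; the type aliases and the helper largeSmall below are our illustrative choices, not names from the original code.

```cpp
#include <map>
#include <set>
#include <vector>

using Transaction = std::set<int>;        // a market basket of item ids
using Cluster = std::vector<Transaction>; // one cluster of transactions

// Split the items occurring in a cluster into "large" items (support
// >= theta * |cluster|) and "small" items (everything else).
static void largeSmall(const Cluster& c, double theta,
                       std::set<int>& large, std::set<int>& small) {
    std::map<int, int> count;
    for (const Transaction& t : c)
        for (int item : t) ++count[item];
    for (const auto& [item, n] : count)
        (n >= theta * c.size() ? large : small).insert(item);
}

// Cost(C) = w * Intra(C) + Inter(C): Intra sums |Small_i| over clusters
// (Eq. 2); Inter is sum(|Large_i|) minus |union(Large_i)|, i.e. the
// duplication of large items across clusters (Eq. 3).
double cost(const std::vector<Cluster>& clustering, double theta,
            double w = 1.0) {
    std::size_t intra = 0, sumLarge = 0;
    std::set<int> unionLarge;
    for (const Cluster& c : clustering) {
        std::set<int> large, small;
        largeSmall(c, theta, large, small);
        intra += small.size();
        sumLarge += large.size();
        unionLarge.insert(large.begin(), large.end());
    }
    return w * static_cast<double>(intra)
         + static_cast<double>(sumLarge - unionLarge.size());
}
```

The allocation and refinement phases of Algorithm 1 below then simply re-evaluate this cost for each candidate placement of a transaction.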
The pseudo-code of the clustering algorithm described above is shown in Algorithm 1.

ALGORITHM 1. CLUSTERING ALGORITHM

/* Allocation Phase */
while not end of file do
    read the next transaction <t, ->;
    allocate t to an existing or a new cluster Ci to minimize Cost(C);
    write <t, Ci>;
end while

/* Refinement Phase */
repeat
    not_moved = true;
    while not end of the file do
        read the next transaction <t, Ci>;
        move t to an existing non-singleton cluster Cj to minimize Cost(C);
        if Ci ≠ Cj then
            write <t, Cj>;
            not_moved = false;
            eliminate any empty cluster;
        end if
    end while
until not_moved;

B. Calculating Expected Profit

Once the clusters are computed, the probability of an item's purchase can be found by first identifying which cluster the current transaction falls into and then looking up the support of the corresponding item in that cluster. That is, the item's probability is estimated as its support within a cluster, rather than its global support. As mentioned in Section IV-A, this manner of computing the probability of items is far more accurate in terms of their likelihood of purchase. In this context, we compute the expected profit of a given item i with the help of its probability (intra-cluster frequency) fi and profit pi. The expected profit for each item can be computed as:
Ei = fi × pi    (5)

C. Recommending Items: Knapsack Approach

The current scenario is that we can recommend k items to the customer, where k is a constant chosen based on the domain. The goal is to maximize profit – so we must ensure that the recommended items have high purchase probability and high profit. There are mainly two options:

1) Recommending items with high purchase probability: Consider the example of lipstick and diamond. Suppose the profit on lipstick is $1 and the profit on diamond is $500, and the purchase probability of lipstick is 0.03 while that of diamond is 0.0001. When we consider the purchase probability as the only criterion, we will be recommending lipstick, which would not yield high profit.

2) Recommending items with high profit: Consider the same example as given above: diamond has far higher profitability than lipstick. When we consider only the high profit, then we will recommend diamond, which is not a good recommendation either, since it is rarely bought.

Rather than maximizing either purchase probability or profit separately as mentioned above, we maximize expected profit, and for that we can directly use the knapsack approach [3]. We reduce the current problem to the knapsack problem in the following manner.

Definition 2 (Knapsack Problem): Given a set of items, each with a weight and a value, determine the number of each item to include in a collection so that the total weight is less than a given limit and the total value is as large as possible.

We reduce profit mining to a knapsack problem using the following equivalences:

• Items to carry in a knapsack correspond to the items for sale in profit mining.
• Item weight (in the knapsack problem) is set to 1 for all items in the store.
• Weight limit (in the knapsack problem) is set to k – the number of items to recommend (in profit mining).
• Item value (in the knapsack problem) is set to the expected profit of that item (which is purchase probability × item profit).

The result is a greedy algorithm to recommend items – once the expected profit of items is computed, we sort all items in decreasing order of these expected profit values and recommend the k items which have the highest expected profits. For the example above, E(lipstick) = 0.03 × $1 = $0.03 while E(diamond) = 0.0001 × $500 = $0.05, so the greedy step ranks diamond first despite its much lower purchase probability. The knapsack approach guarantees that this greedy approach maximizes the overall expected profit. The pseudo-code of the resulting algorithm as described above is shown in Algorithm 2.
ALGORITHM 2. MINING PROFITABLE ITEMS

Identify the cluster to which the user's transaction belongs
I = items bought by the user
for each cluster Ci do
    w = weight function
    Cost(Ci) = w × Intra(Ci) + Inter(Ci)
end for
min = minimum cost value
minid = id of the cluster which has the minimum cost
for each item i in the cluster having the minimum cost do
    fi = frequency of item i
    pi = profit of item i
    Ei = fi × pi (expected profit)
end for
sort the items in descending order of expected profits
select the k items with maximum value from the sorted list
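Since all knapsack weights are 1 and the capacity is k, the greedy step of Algorithm 2 reduces to a top-k selection by expected profit. A minimal C++ sketch follows; the Item struct is our own illustrative shape, not taken from the paper's code.

```cpp
#include <algorithm>
#include <vector>

struct Item {
    int id;
    double frequency; // intra-cluster support, used as purchase probability
    double profit;    // per-item profit p_i
    double expected() const { return frequency * profit; } // E_i = f_i * p_i
};

// Unit-weight knapsack with capacity k: picking the k items with the
// highest expected profit is optimal, so a partial sort suffices.
std::vector<Item> recommend(std::vector<Item> items, std::size_t k) {
    k = std::min(k, items.size());
    std::partial_sort(items.begin(), items.begin() + k, items.end(),
                      [](const Item& a, const Item& b) {
                          return a.expected() > b.expected();
                      });
    items.resize(k);
    return items;
}
```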
V. EXPERIMENTAL EVALUATION
To evaluate the performance of the ProMax algorithm we ran a set of experiments on two data sets: a retail data set [5] available from the Frequent Itemset Mining Implementations (FIMI) Repository [15] and a synthetic dataset [13]. The choice of these particular data sets is not restrictive since they are widely used standard benchmark data sets, and their structure is typical of the most common
application scenarios. In fact, we can apply this algorithm in all scenarios provided we have the past transactions and a current customer transaction. We compare the efficiency of the ProMax algorithm against the DualRank algorithm [12], because the DualRank algorithm has already been shown to perform well when compared with the HAP algorithm and the naive approach. The DualRank algorithm [12] is the state of the art among item selection methods with a customer-behavior model in the data mining literature, whereas the ProMax algorithm is based on a customer-item-behavior model.

All experiments are conducted on a 1.60GHz Intel PC with 8GB main memory, running Linux. Both algorithms were coded in C++. Profits of items are generated randomly, since the way profit is generated does not affect the nature of the results. To perform the experiments, we took a set of four to five items constituting a current customer transaction as input and recommend the top five items to the user as output, based on the past transaction history.

A. Performance of DualRank

Initially we considered the performance of DualRank on the synthetic dataset. For a larger number of transactions, DualRank was not able to execute fast enough due to the huge number of computations it has to perform. The performance of DualRank is shown in the graph of Fig. 1. For a minimum support greater than 0.1, DualRank could not be executed. The reason is that there are very few frequent singleton items, so there are no edges in the item-graph created by DualRank. We notice that the performance of DualRank deteriorates as the min-support value is reduced. The reason is that for low minimum supports, the number of association rules generated is very large, resulting in a very large matrix. This makes it difficult to perform the intensive matrix operations that DualRank requires, such as the calculation of eigenvalues and eigenvectors. DualRank's recommendations are independent of a particular input transaction, but are globally determined by the overall profitability of individual items.

The ProMax algorithm was also evaluated under the same conditions as DualRank. The performance of ProMax is shown in the graph of Fig. 3. We notice that it performs better over the entire range of min-support values that DualRank could run on. We notice that its performance increases slightly when the min-support is very low. This is only coincidental – there is as such no particular direct relationship between the min-support value and the profits generated by ProMax. This is because here the min-support can only affect the quality of clustering by modifying the intra-cluster and inter-cluster costs. Both too low and too high values of min-support could deteriorate the clustering quality. However, there is a damping effect – when min-support is reduced, there are more small items, leading to a high intra-cluster cost and a low inter-cluster cost. The increase in intra-cluster cost is often compensated by the decrease in inter-cluster cost, thereby resulting in no net change in cluster quality.

We have also noticed that by keeping the min-support value constant and varying the number of items being
selected, ProMax outperforms the DualRank algorithm, as shown in Fig. 2. DualRank always generates static recommendations and is independent of the customer's current transaction. Hence, until the database of previous transaction history changes, DualRank always recommends the same items.

Figure 1. DualRank performance

Figure 2. Comparison of profits earned by the algorithms based on the number of items selected

B. Performance of ProMax

In this experiment, we evaluated the performance of ProMax on both the real and synthetic datasets. The results are shown in the graph of Fig. 3. In this graph, the x-axis denotes the min-support, whereas the y-axis denotes the profit generated by ProMax. As per our observation, the algorithm performs in the same manner across different datasets. For datasets which are not very sparse, profit is comparatively more. This is because of the clustering approach, where the bulkiness of the clusters increases. Also, the clustering quality effect described in the previous experiment is clearly visible in the real dataset curve. Notice that this curve has a peak at around min-support = 0.05, when clustering quality is high, and deteriorates on both sides as the min-support is either increased or decreased.

Figure 3. ProMax performance on different datasets

VI. CONCLUSIONS AND FUTURE WORK

In this paper, we presented an algorithm that combines simple ideas from clustering, recommendation systems, and the knapsack problem to recommend those items to customers that maximize the expected profit. We tested our algorithm on two popular datasets and compared our algorithm with the previously existing DualRank algorithm. Our experiments showed that our algorithm performs better than DualRank in terms of obtaining better profits, while at the same time being faster and simpler. We believe there are a few aspects in which the current work can be extended: in the initial phase of our algorithm, the clustering can be done more efficiently by appropriately identifying the parameters while calculating the cost. Also, in our algorithm only the type of the items was considered and not their quantity, which can be addressed as future work.

REFERENCES

[1] Tom Brijs, "Retail market basket analysis: A quantitative modelling approach", Ph.D. dissertation, Faculty of Applied Economics, Limburg University Center, Belgium, 2002, pp. 107-109.
[2] Ke Wang, Senqiang Zhou and Jiawei Han, "Profit Mining: From patterns to actions", Proc. of the 8th International Conference on Extending Database Technology, 2002, pp. 70-87.
[3] Ellis Horowitz and Sartaj Sahni, "Fundamentals of Computer Algorithms", W. H. Freeman & Company, 1984.
[4] Raymond Chi-Wing Wong and Ada Wai-Chee Fu, "ISM: Item selection for marketing with cross-selling", PAKDD, 2004, pp. 431-440.
[5] Tom Brijs, Gilbert Swinnen, Koen Vanhoof and Geert Wets, "Using association rules for product assortment decisions: a case study", Proc. of the Fifth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, 1999, pp. 254-260.
[6] S. Zhou and K. Wang, "Profit Mining", to appear in Encyclopedia of Data Warehousing and Mining, 2004, pp. 107-109.
[7] Ke Wang and Ming-Yen Thomas Su, "Item Selection by 'hub-authority' profit ranking", Proc. of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, 2002, pp. 652-657.
[8] Sergey Brin and Lawrence Page, "The anatomy of a large-scale hypertextual Web search engine", Computer Networks and ISDN Systems, Vol. 30, 1998, pp. 107-117.
[9] Raymond Chi-Wing Wong, Ada Wai-Chee Fu and Ke Wang, "MPIS: Maximal-profit item selection with cross-selling considerations", Third IEEE International Conference on Data Mining, 2003, pp. 371-378.
[10] Raymond Chi-Wing Wong, Ada Wai-Chee Fu and Ke Wang, "Data mining for inventory item selection with cross-selling considerations", Data Mining and Knowledge Discovery, Vol. 11, 2005, pp. 81-112.
[11] Ke Wang, Chu Xu and Bing Liu, "Clustering transactions using large items", Proc. of the Eighth International Conference on Information and Knowledge Management, 1999, pp. 483-490.
[12] Xiujuan Xu, Lifeng Jia, Zhe Wang and Chunguang Zhou, "DualRank: A dual-phase algorithm for optimal profit mining in retailing market", ASIAN, 2005, pp. 182-192.
[13] Rakesh Agrawal, "IBM Synthetic Data Generator", http://www.almaden.ibm.com/cs/quest/syndata.html, 2004.
[14] Se June Hong, Ramesh Natarajan and Ilana Belitskaya, "A New Approach for Item Choice Recommendations", Proc. of the Third International Conference on Data Warehousing and Knowledge Discovery, 2001, pp. 131-140.
[15] FIMI Repository, http://fimi.cs.helsinki.fi/data
GIRAF: Generic, Interactive, Regression Analysis Framework
Himanshu Singh, Aditya Desai and Vikram Pudi
Center for Data Engineering, International Institute of Information Technology, Hyderabad, India
{himanshusingh,aditya_desai}@students.iiit.ac.in, [email protected]
Abstract—Regression modeling is a major area of data mining that has been extensively studied in the past. These models aim to estimate the value of a dependent numeric variable based on the values of one or more independent variables, thus being useful in prediction (including forecasting of time-series data), inference, hypothesis testing and modeling of causal relationships. Unfortunately, in spite of years of research in the area of regression modeling, there exists no single best solution for all dataset domains, primarily because of the infamous no-free-lunch theorem. In this paper we propose an expert-oriented interactive framework for regression, GIRAF - Generic, Interactive, Regression Analysis Framework. GIRAF alleviates the task of manually labeling tuples by selecting a subset of high-quality tuples which then need to be labeled by domain experts, yielding a model with relatively high accuracy. Our experimental study demonstrates both low error rates and high effort savings on the expert's part, as an accurate predictor can now be constructed with a small amount of quality labeled data.

Keywords-Regression, Interactive Data Mining, Learning, Online Data Streams, Ensemble Learner.
I. INTRODUCTION
Regression modeling is a major area of data mining that has been extensively studied in the past. The goal of a regression model is to estimate the value of a dependent numeric variable based on the values of one or more independent variables. Regression algorithms can be used for prediction (including forecasting of time-series data), inference, hypothesis testing and modeling of causal relationships.

The current set of regression algorithms use a train-test model, where a model is built using the training data and evaluated against a test data. These algorithms don't provide support for incorporating knowledge provided by domain experts, thus leading to loss of knowledge that could have been utilized for building an efficient and accurate model. Additionally, these algorithms come up with bulky models which are inefficient and require a lot of computation power. Ideally, we require algorithms that incorporate the knowledge of domain experts, thus building systems that combine the best of computer processing and human intelligence.

In some application domains, the amount of available data is scarce. In such cases a large amount of synthetic data is generated and then manually tagged by a domain expert. There may also be domains where a large amount of data is present but is unlabeled and hence needs to be tagged. This step generally requires human involvement and hence is extremely costly and time consuming. In this work we reduce the amount of effort required by the domain expert to build a high quality
prediction engine, where only a subset of tuples need to be tagged for equivalent performance. In industrial settings each training point may take days to gather and cost thousands of dollars, thus making the method of optimally selecting high quality data points important.

Unfortunately, in spite of years of research in the area of regression modeling, there exists no single best solution for all dataset domains, primarily because of the no-free-lunch theorem [1]. Also, the process of developing regression algorithms is separate from its actual application, which is mostly done by groups of domain-specific researchers who can take full advantage of their special areas of expertise. Researchers possess varied skills, intelligence, cognitive styles and levels of tolerance of frustration, and approach a problem with diverse preferences, requirements and background knowledge. It is evident that computer systems are exceptionally good at crunching numbers, producing exact parametrizations and exploring large numbers of alternatives. Thus, if we can combine the best of human and computer processing, we should be able to develop systems that are superior to all existing approaches, which motivates and justifies the field of interactive data mining [2][3][4].

Interactive data mining (IDM) is a field of machine learning which focuses on efficient human-computer interactions for data analysis purposes. This field has been explored extensively in the past few years, addressing both domain and normal users. The process of continuous knowledge exchange between a human expert and a computer, effectively resulting in the discovery of new and unique knowledge arising primarily because of expert intervention, is termed interactive learning. This process of interactive learning is dynamic and hence not confined to a fixed training size. Thus, the main feature of an interactive learner lies in grasping user preferences quickly and applying them for effective decision making in the future. We thus define interactive learning as the process of embedding experts' cognitive inputs in standard regression models to come up with a more accurate model and hence output.

In this paper we present GIRAF - Generic, Interactive, Regression Analysis Framework, a novel approach to interactive regression mining. GIRAF has the following attractive and desirable features:

• The GIRAF framework follows the concept of plugin-plugout, i.e. any regression algorithm can be plugged into it, thus making it generic across regression algorithms and dataset domains.

• The interactive nature of the framework allows for seamless integration of experts' domain knowledge, making it useful for almost all practical applications involving prediction.
• As the domain expert decides on the kind of data that can be added, the expert can study the behaviour of the system when the data used for building the model is either noisy or noise-free.

• Data mining techniques like classification and regression require a training set where a label is associated with every tuple. In domains like medical informatics, obtaining such training data is a difficult and challenging task. In such cases, a large amount of training data is synthetically generated and then manually tagged by domain experts. However, such an approach requires a large amount of human effort and time. This is the motivation for our study, where the domain expert is able to build a high quality model by tagging a relatively small number of samples.

• The framework can also be modeled as an ensemble based learner, as two or more predictor models can be combined to give a resultant model more powerful than each individual model. The resultant model may judge the output on the basis of boosting, or alternative methods may be proposed. One of the alternatives may be to plug the output of one regression model into the input of other(s), if the confidence in prediction is below a pre-determined threshold. The decision to combine and mix models is application-oriented and left entirely to the domain expert.

• Another useful modeling of this framework is for a live data stream, where a model is initially created and is subsequently refined according to the input stream, to incorporate domain knowledge for accurate and effective prediction.
The remainder of the paper is organized as follows: In Section 2 we describe the regression problem and the interactive data mining paradigm. We then discuss related work in Section 3. In Section 4 we present the GIRAF algorithm. Then, in Section 5 we describe the applications of our framework and experimentally evaluate it. Finally, in Section 6, we summarize the conclusions of our study and identify future work.

II. PROBLEM FORMULATION
In Section 2.1 we formulate the regression problem, and we define the interactive regression model in Section 2.2.

A. Regression

The problem of regression is to estimate the value of a dependent numeric variable (also called the response variable) based on the values of one or more independent variables (also called feature variables). Formally, the input to the problem consists of:

• A vector of n feature variables X = (x1, . . . , xn), forming an n-dimensional space.
• A numeric target response variable y.
• A training set of m data samples D = {(X1, y1), ..., (Xm, ym)}, where Xi are points in X-space and yi are the corresponding values of the response variable.

The output is an estimate of the value of y for new input points in X-space. Statistical approaches model the
relationship between the dependent and independent variables as a closed-form function:

y = F(X, β)

where β is a vector of unknown parameters whose values are determined by fitting the above equation to the training set.

B. Interactive Regression Model

We formulate the interactive regression model on the basis of the standard regression problem described in Section 2.1 above, with a few modifications. In the standard regression problem, the algorithm passes through two basic stages. In the first stage, a model is built from the available training data, and in the second stage the model is evaluated against a test data. In the proposed model, we divide the dataset into the following sets:

• The first set of data is called the train data and is used for building the initial model using any regression algorithm, depending on the user's requirement. This step is equivalent to the training step of any regression model.

• The second set of data is called the refine data. The model built using the previous train set is used for prediction of tuples in this set. Every predicted value has a confidence value which reflects the confidence of the predictor in the prediction of the given tuple. If this confidence value falls below a specified threshold, the tuple is incorporated in the train set along with its dependent variable value and the model is modified to incorporate the changes; else the tuple is ignored. Note that the tuples in the refine set may not be labeled initially, and the expert is called upon to label only those tuples for which the prediction confidence is less than the specified threshold.

• The third set of data is called the test data and is used for evaluating the built model. This step is equivalent to the testing step of any regression model.

III. RELATED WORK
In this section we describe work related to the new regression framework that we propose in this paper.

Online Interactive Data Mining (OIDM) [3] is an interactive mining tool which combines multiple functions of querying the system, taking algorithmic recommendations and summarizing the results. This tool provides support for a large range of classification, association rule mining and clustering algorithms. However, there is no regression algorithm supported by this tool. OIDM basically uses the information gathered from the user to recommend the correct algorithm for the data mining task.

Yan Zhao, Yiyu Yao and Mingwu Yan have built an Interactive Classification System (ICS) [2] which finally outputs a set of rules for classification, represented as a granule tree. They have proposed that interactive data mining is more about the adaptiveness and effectiveness of the system rather than overemphasizing the automation and efficiency of the system.

Yao and Shi [4] have presented a user-centered view of interactive data mining which enables the user to create a reasonable and meaningful cognitive structure. The study
of human-computer interaction which involves domain-expert judgment in the whole process of data mining is presented there. Subsequently, all three user requirements and preferences are mapped into a philosophy layer, a technique layer and an application layer respectively.

Weka [5] is another famous interactive data mining tool. It contains tools for data preprocessing, regression, clustering, association rules, classification and data visualization. Another approach to human-computer interaction [6] is in the field of data visualization: here, after visualizing the available data, appropriate data is selected and the desired data mining algorithm is applied to it. Haiku [12] is another interactive data mining (IDM) tool which combines the concepts of visualization and interestingness for knowledge discovery.

In Dynamic Graphic Regression [7], the authors have built software for interactive regression analysis. The claim is that many researchers collect a dataset for a specific purpose, perform the planned form of analysis using a computer package, and report the results, never having actually looked at the data. In that work, the visualized data is used to select the appropriate analysis procedure.

Active learning is a much-studied field in subset selection, where the problem solved is selecting a subset of the data for building an accurate model. However, the approaches used in solving this problem require that the entire data be tagged [8]. Thus, this problem is different from the problem tackled in this paper, where, on being presented a large set of unlabeled tuples, we present the domain expert with a small set of tuples which then need to be labeled for building an efficient model. In addition, there exist works like [9][10] which perform subset selection without the data being labeled. However, these methods have been derived for classification, where a structure has been derived for the data by assuming that the classes are linearly separable to a large extent, which is not possible for regression. Also, these methods suffer from the curse of dimensionality, and hence a large number of points are sampled for high-dimensional spaces, as the majority of the subspaces pose high uncertainty. It may be noted here that an advantage our framework renders is the ability to set a confidence measure in a manner which makes the tuple selection algorithm relatively robust to the number of dimensions.

The concept of fuzzy logic has been extensively used in the field of OIDM. In the classification- and regression-based surrogate model [11], the authors have proposed a surrogate-model-assisted approach to alleviate user fatigue by building a classifier and a regressor to approximate the user's cognition. Two reliable training data sets are obtained based on the user's evaluation credibility. Then a support vector classification machine and a support vector regression machine are trained as the surrogate models with these samples. Specifically, the input training samples are the individuals evaluated by the user, and the output training samples of the classifier and the regressor are the widths and centers of these individuals' fuzzy fitness assigned by the user, respectively.

These algorithms only aid experts who want to derive knowledge from data, and hence expert or domain knowledge can be applied to models only at the start or the end of the procedure. Most of these algorithms are domain-specific, and the generic ones are developed only for classification, association rules and clustering analysis. In our paper we present an adaptive regression framework with real-life applications where the domain expert's knowledge can be continuously used to modify and develop the model according to the incoming input stream of data. The requirement for expert support decreases as the model matures, as is evident through experimental results.

IV. THE GIRAF FRAMEWORK
In this section we present GIRAF - Generic, Interactive, Regression Analysis Framework. GIRAF follows the concept of plugin-plugout, i.e. any regression algorithm, with minor modifications (subsection 4.2), can be embedded in this framework. The basic framework of GIRAF is depicted in Figure 1. In the following subsections we describe each step of the GIRAF framework.
Figure 1: Working of the GIRAF Framework
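As an illustration of the plugin-plugout idea, the interface below sketches, in C++, the minimal contract a regression algorithm might expose to the framework. The names (Sample, Regressor, train, predict, update) are hypothetical illustrations, not taken from the paper.

```cpp
#include <vector>

// One labeled observation: feature vector X and response value y.
struct Sample {
    std::vector<double> x;
    double y;
};

// Minimal contract for a pluggable regression algorithm: build an
// initial model, predict with an associated confidence, and absorb a
// newly expert-labeled tuple on the fly.
class Regressor {
public:
    virtual ~Regressor() = default;
    // Build the initial model from the (small) train set.
    virtual void train(const std::vector<Sample>& trainSet) = 0;
    // Predict y for x; 'confidence' in [0,1] reflects how trustworthy
    // the prediction is (e.g. neighbourhood density for a kNN predictor).
    virtual double predict(const std::vector<double>& x,
                           double& confidence) const = 0;
    // User-defined update function: incorporate one labeled tuple.
    virtual void update(const Sample& labeled) = 0;
};
```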
A. Building Initial Model

The first step of this framework is equivalent to the train step of the standard prediction procedures which follow a train-test paradigm. Typically in a train-test paradigm, a model is built initially using the training data, followed by a test step in which the records of the test set are presented and the value of the dependent variable is predicted using the model. As the value of the dependent variable is known for these tuples, the error in prediction can be computed.

In this step of GIRAF, we start with a very small amount of train data. For our experiments, which are described later in Section 5.1, we take 5% of the total dataset as the train set. However, it may be noted that a smaller dataset could have been used. The values of the dependent variable are determined by domain experts for tuples of this set and are included in the train dataset. After this, a user-supplied regression algorithm is run on this dataset and the corresponding model is built.
B. Prediction Using Model

Here, we sequentially read the records of the refine set, and for each record encountered, the value of the dependent variable is predicted according to the built model and a confidence value is associated with the prediction. The confidence measure can be considered a measure of the ambiguity associated with the predicted value. For example, consider the case of a k-Nearest Neighbour predictor, where a particular tuple is assumed to be influenced only by its neighbourhood. We assign a low confidence value to tuples which have fewer points in the said neighbourhood (however, other confidence measures may be used for determining ambiguity). These parameters are defined by domain experts having sufficient knowledge and expertise in the field. It may be noted that the task of selecting an appropriate confidence measure and the relevant parameters lies entirely with the domain experts.

It may also be noted that the framework allows a combination of two or more predictor models to give a resultant model which may be more powerful than each individual model. The resultant model may judge the output on the basis of boosting, or the output of one model may be fed to the input of the other if the confidence of the former is below a specified threshold. The decision to combine and mix models is left entirely to domain experts.

C. Feedback by Domain Experts and Updating Model

Before we move to determining how the feedback of domain experts can be incorporated, we define a confidence measure threshold θ. The confidence measure threshold can be defined as follows: if the confidence of prediction of a tuple is greater than θ, then no learning takes place. However, if the confidence value is less than θ, then the tuple is labeled by the domain experts and this knowledge is incorporated into the regression model through a user-defined update function. After this step, we proceed to read the next input record in the refine set and repeat the steps as in the previous section.

D. Testing

After the relevant knowledge from the train and refine sets has been obtained, the model is evaluated on the test set. This step is equivalent to the test step of the standard prediction procedures which follow a train-test paradigm. Here, the value of the dependent variable is predicted for records of the test set using the model and the corresponding root mean square error is determined.
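Steps B and C together form a loop over the refine set. The following C++ sketch ties them together, reusing the hypothetical Regressor interface from the earlier sketch; askExpert stands in for the interaction with the domain expert.

```cpp
#include <functional>
#include <vector>

// Refine phase: predict each tuple of the refine set; when the
// confidence falls below the threshold theta, query the expert for the
// true label and update the model. Returns the number of expert queries.
int refine(Regressor& model,
           const std::vector<std::vector<double>>& refineSet,
           double theta,
           const std::function<double(const std::vector<double>&)>& askExpert) {
    int queries = 0;
    for (const std::vector<double>& x : refineSet) {
        double confidence = 0.0;
        model.predict(x, confidence);
        if (confidence < theta) {   // ambiguous prediction: learn from expert
            double y = askExpert(x);
            model.update({x, y});
            ++queries;
        }                            // confident prediction: tuple is ignored
    }
    return queries;
}
```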
V. INSTALLING GIRAF

In this section, we present the utility of GIRAF by applying it to a series of real-life domains.

A. Experimental Settings

It has been seen in [15] that PAGER is more accurate than or comparable to most of the other regression algorithms on a wide range of datasets. Hence, for demonstrating the effectiveness of GIRAF, we plug PAGER into the GIRAF framework and show that the mean error rates for the resultant model are smaller than for standalone PAGER, even though the former requires less data to be
labeled. Thus, the work of the domain expert is reduced considerably, as the amount of data labeled by the expert is reduced.

We use standard datasets obtained from the UCI [16] and StatLib [17] repositories and then randomly divide the entire data into independent train, refine and test sets. We have used four standard datasets, namely Abalone [16], Concrete [13], Stock [17] and Space [14], from these repositories, whose details are presented in Table I. To simulate interactions with domain experts, we use the value labeled in the dataset as the value suggested by the domain expert. However, this approach applies only when we have to select a subset of unlabeled training data to be tagged.

As PAGER is a kNN-based approach, it uses the concept of kNN, which assumes that the value of a point is highly affected by its surrounding neighbours. Thus, kNN is bound to give poor results if the density of points in the neighbourhood of the test tuple is low (less than a specified threshold). For our experiments we normalize the values of the independent variables so that they lie between 0 and 1. The confidence measure for prediction is defined as:
confidence = 1 − Distance(kth) / (Number of Independent Dimensions)

where Distance(kth) is the distance of the kth neighbour from the input tuple. For our experiments, we take the value of k as 10. The confidence threshold (θ) is set as follows:

θ = 1 − [ Σ_{i=1..trainsize/2} Distance_k^i ] / [ (trainsize/2) × (Number of Independent Dimensions) ]
where trainsize is the size of the training dataset, and Distance_k^i is defined as follows: we iterate over all training points and compute the distance of the kth nearest neighbour for each of these points; Distance_k^i is then the ith minimum of these distances. The intuition behind these confidence measures is that if the density of points in the neighbourhood is low, the distance of the kth neighbour will be high and hence the confidence value will be low, which is in line with the concept of nearest neighbour predictors.
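A small C++ sketch of this confidence measure, under the stated assumptions (features normalized to [0,1], k = 10); the division by the number of dimensions follows the first formula above, and all function names are our own.

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Euclidean distance between two points in the normalized feature space.
static double dist(const std::vector<double>& a, const std::vector<double>& b) {
    double s = 0.0;
    for (std::size_t i = 0; i < a.size(); ++i)
        s += (a[i] - b[i]) * (a[i] - b[i]);
    return std::sqrt(s);
}

// Distance from query q to its k-th nearest neighbour in the train set.
double kthDistance(const std::vector<std::vector<double>>& train,
                   const std::vector<double>& q, std::size_t k) {
    std::vector<double> d;
    d.reserve(train.size());
    for (const auto& p : train) d.push_back(dist(p, q));
    std::nth_element(d.begin(), d.begin() + (k - 1), d.end());
    return d[k - 1];
}

// Confidence of predicting at q: close to 1 in dense neighbourhoods,
// lower as the k-th neighbour moves further away.
double confidence(const std::vector<std::vector<double>>& train,
                  const std::vector<double>& q,
                  std::size_t k, std::size_t numDimensions) {
    return 1.0 - kthDistance(train, q, k) / static_cast<double>(numDimensions);
}
```

The threshold θ can be obtained analogously by averaging the k-th neighbour distances of the trainsize/2 closest-packed training points, as in the second formula.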
Number of numeric attributes
Size
Source
Abalone
8
500
(16)
After In this section, we present the utility of GIRAF by applying it to a series of real-life domains.
Concrete
8
1030
(13)
Stock
10
950
(17)
A. Experimental Settings It has been seen in [15], that PAGER was found to be more accurate or comparable to most of the other regression algorithms on a wide range of datasets. Hence, for demonstrating the effectiveness of GIRAF, we plugin PAGER in the GIRAF framework and show that the mean error rates for the resultant model are smaller than PAGER even though the former requires less data to be
Space
7
3107
(14)
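Read literally, the confidence measure and threshold above can be computed as in the following sketch; the brute-force kNN search and all variable names are our own, and we assume plain Euclidean distance over tuples normalized to [0, 1].

```python
# Sketch of the confidence measure and threshold tau defined above.
import math

def euclidean(a, b):
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

def kth_neighbour_distance(x, points, k=10):
    """Distance of the k-th nearest point from tuple x."""
    dists = sorted(euclidean(x, p) for p in points)
    return dists[k - 1]

def confidence(x, train, k=10):
    d = len(x)  # number of independent dimensions
    return 1.0 - kth_neighbour_distance(x, train, k) / d

def threshold(train, k=10):
    """tau = 1 - (sum of the trainsize/2 smallest k-th neighbour
    distances) / ((trainsize/2) * number of dimensions)."""
    n, d = len(train), len(train[0])
    half = n // 2
    kth = sorted(
        kth_neighbour_distance(x, [t for t in train if t is not x], k)
        for x in train
    )
    return 1.0 - sum(kth[:half]) / (half * d)
```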
B. Results
We divide the entire dataset into a train set (approximately 5%), a refine set (approximately 85%) and a test set (approximately 10%). To compare PAGER in
the GIRAF framework with PAGER in the train-test paradigm, the following procedure is followed:
• We run PAGER in the GIRAF framework and note the error in prediction on the test set, as well as the number of queries made to the domain expert.
• We then randomly add tuples from the refine set to the train set of PAGER, train PAGER on this set, and evaluate PAGER on the test set. We stop this procedure when the mean error rate of PAGER becomes approximately equal to the mean error of PAGER in the GIRAF framework.
• As the tuples are added randomly, there is a large variation in error, and hence an average over the observations has been taken. (A code sketch of this procedure is given after this section.)

C. Discussion of Results
In this section we provide a brief description of our experimental methodology for the Abalone dataset; a similar methodology has been used for the other datasets. The results are summarized in Table II. For the Abalone dataset, we use 200 tuples each for training and testing, and the rest for the refine set. We set the confidence threshold to 0.96. When PAGER was plugged into GIRAF, the root mean square error was found to be 2.046 and the number of queries to the domain expert was 123; hence the total number of tuples tagged by the domain expert was 323 (200 train + 123 refine). After this, data from the refine set was added to the training set of PAGER and, after training, the model was evaluated on the test set. Initially a PAGER model was built on the train set of PAGER comprising 200 tuples. Then, 123 tuples of the refine set (equal to the number of queries to the expert) were added to the train set (bringing the total number of tuples in the train set to 323) and the model was evaluated. The size of the training dataset was then increased in steps of 100 by adding tuples from the refine set to the train set, and the mean error rate of standalone PAGER was evaluated on the test set. When the number of tuples in the train set of PAGER reached 673 (approximately 350 more than the number of tuples required by PAGER in the GIRAF framework), the accuracy of standalone PAGER was found to be approximately equal to that of PAGER in the GIRAF framework. It is also evident from Figure 2 that the number of queries to domain experts decreases as training progresses over the refine data, which indicates that the model has matured and thus predicts a larger number of tuples with a confidence higher than the threshold. The higher confidence in prediction of tuples in later parts of the refine step stems from the fact that, over time, points are added to sparsely populated neighbourhoods, and hence very few regions remain with insufficient points for high-quality prediction.
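The comparison procedure of Section B above can be sketched as follows; evaluate is a caller-supplied function returning the mean test error of standalone PAGER trained on the given tuples, a hypothetical stand-in rather than part of any published implementation.

```python
# Sketch of the comparison protocol above.
import random

def matching_train_size(train, refine, test, giraf_error, evaluate,
                        step=100, trials=10):
    """Randomly grow PAGER's train set with refine tuples until its
    mean test error matches that of PAGER inside GIRAF; average the
    resulting train-set size over several random trials."""
    sizes = []
    for _ in range(trials):
        pool, current = list(refine), list(train)
        random.shuffle(pool)
        while pool and evaluate(current, test) > giraf_error:
            current.extend(pool[:step])   # add tuples in steps of 100
            del pool[:step]
        sizes.append(len(current))
    return sum(sizes) / len(sizes)
```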
TABLE II. EXPERIMENTAL RESULTS ON DATASETS

Data-set   Train Size   Test Size   Conf. value   No. of queries   Total tagged   Size of random sample
Abalone    200          200         0.96          123              323            723
Concrete   50           100         0.93          183              233            307
Stock      50           50          0.95          103              153            178
Space      100          100         0.95          202              302            452

VI. CONCLUSION
In this paper, we have presented a novel approach to regression modeling where the model can be modified on the fly, by the addition of new knowledge, to suit the input stream of data. This is an advantage over existing interactive learning approaches, where the model can be modified only at the beginning of the test process or after the test process has ended. We also demonstrate the utility of the GIRAF framework by comparing the accuracy of an algorithm in the GIRAF framework with the standalone algorithm for varying amounts of data. It has been observed that a regression model with comparable accuracy can be constructed in the GIRAF framework using comparatively smaller amounts of training data, thus reducing the effort of the domain expert, as less data has to be labeled for comparable performance. Future work includes the development of a Grand Unified Model combining data mining paradigms like association rules, clustering, and classification with the existing regression framework. We also intend to include a visualization interface which will aid the user in selecting the required algorithms and the corresponding training and testing parameters.
REFERENCES
[1] Wolpert, David: "The Lack of A Priori Distinctions between Learning Algorithms," Neural Computation, pp. 1341-1390, 1996.
[2] Yan Zhao, Yiyu Yao, and Mingwu Yan: "ICS: An Interactive Classification System," Proceedings of the Canadian AI, LNAI 4509, 341-345, 2007.
[3] Qijun Chen, Xindong Wu, Xingquan Zhu: "OIDM: Online Interactive Data Mining," Proceedings of IEA/AIE, 66-76, 2004.
[4] Yao, Y.Y. and Shi, Z.Z.: "User-centered Interactive Data Mining," International Journal of Cognitive Informatics and Natural Intelligence (IJCINI), 58-72, 2008.
[5] Mark Hall, Eibe Frank, Geoffrey Holmes, Bernhard Pfahringer, Peter Reutemann, Ian H. Witten: "The WEKA Data Mining Software: An Update," SIGKDD Explorations, Volume 11, Issue 1, 2009.
[6] Wang, J. (Ed.): Encyclopedia of Data Warehousing and Mining, 2nd edition, 1085-1090, Idea Group Inc., 2008.
[7] Sykes, Alan M. and Mayer, Alan D.: "Dynamic Graphic Regression," SIGAPL APL Quote Quad, Vol. 32, 2002.
[8] Hiroshi Mamitsuka, Naoki Abe: "Efficient Data Mining by Active Learning," Japanese Discovery Science Project, 2002.
[9] David A. Cohn, Les E. Atlas, Richard E. Ladner: "Improving Generalization with Active Learning," Machine Learning, 1994.
[10] David A. Cohn, Zoubin Ghahramani, Michael I. Jordan: "Active Learning with Statistical Models," The Computing Research Repository (CoRR), 1996.
[11] Sun, Xiao Yan, Gong, Dunwei and Li, Subei: "Classification and regression-based surrogate model-assisted interactive genetic algorithm with individual's fuzzy fitness," GECCO '09: Proceedings of the 11th Annual Conference on Genetic and Evolutionary Computation, 2009.
[12] Russell Beale and Andy Pryke: "HAIKU: Interactive Comprehensible Data Mining," Ambient Intelligence for Scientific Discovery, 2005.
[13] I.-C. Yeh: "Modeling of strength of high performance concrete using artificial neural networks," Cement and Concrete Research, 28, No. 12:1797-1808, 1998.
[14] Pace, R. Kelley and Ronald Barry: "Quick Computation of Regressions with a Spatially Autoregressive Dependent Variable," Geographical Analysis, Volume 29, Number 3, July 1997, pp. 232-247.
[15] Aditya Desai, Himanshu Singh, Vikram Pudi: "PAGER: Parameterless, Accurate, Generic, Efficient kNN-based Regression,"
Technical Report IIIT/TR/2009/157, IIIT Hyderabad, http://web2py.iiit.ac.in/publications/default/view publication/techreport/59, October 2009.
[16] A. Asuncion and D. Newman: UCI Machine Learning Repository, 2007.
[17] StatLib datasets archive, http://lib.stat.cmu.edu/datasets/.
Figure 2: Number of tuples of refine data considered v/s Number of expert queries
Integration of CAD/CAM using Intelligent Process Planning

Deepali Tatkar
Software Department
New Vision Online Services Pvt. Ltd
Mapusa, Goa, India
[email protected]
(Deepali Tatkar is currently pursuing a Ph.D. in the Department of Computer Science and Technology at Goa University.)

Venkatesh Kamat
Department of Computer Science and Technology
Goa University
Goa, India
[email protected]
Abstract— Process planning translates design information into the process steps and instructions needed to efficiently and effectively manufacture a product. Computer-Aided Process Planning (CAPP) bridges the gap between Computer Aided Design (CAD) and Computer Aided Manufacturing (CAM): it integrates the design representation of CAD systems with the manufacturing process representation of CAM systems. In this paper we describe how we have implemented an Automated Process Planning System that automatically generates a process plan based on the design information, the available resources and the expert's knowledge. The system is built by creating a knowledge base that contains manufacturing-specific rules. The rules cover the whole manufacturing process, from raw material selection to finishing operations. The inference engine then selects the appropriate rules and executes them in order to generate a process plan.

Keywords - Automated Process Planning, Reasoning Rules, Knowledge Base, Reasoning Engine
I. INTRODUCTION
Planning and scheduling are the two most important activities involved in any manufacturing organization. Although the terms planning and scheduling are often used together, they are two different activities with different objectives. Planning is concerned with what to do and how to do it, whereas scheduling is concerned with when to do it and who will do it. Identifying the set of actions to be performed and the resources required to perform them is part of the planning activity. Scheduling, on the other hand, is concerned with identifying a schedule from a large number of available alternatives. In some cases it is difficult to decompose planning and scheduling cleanly, and in that case we consider them together as a single activity. The objective of a Computer-Aided Process Planning (CAPP) system is to bridge the gap between the Computer Aided Design (CAD) system and the Computer Aided Manufacturing (CAM) system. In this paper we discuss an implementation of CAPP wherein the design of a mechanical part is given in the STEP file format and the objective is to automatically generate a process plan for manufacturing it. The automation of the process plan involves making inferences based on the decision rules available in the knowledge base. These decision rules codify the manufacturing expert's knowledge in the form of production rules. The automation of a manufacturing process plan from a product design can be seen as a two-stage process. The first stage deals with recognition of the manufacturing features from the CAD product description. The recognized manufacturing features serve as the input to the second stage. Here the task is to generate a manufacturing process plan by removing manufacturing features from the workpiece block model with a certain sequence and accuracy so that the product model is approached. In short, process planning translates design information provided by a design engineer into detailed work instructions to manufacture the part. A detailed work instruction in process planning includes: selection of appropriate machines, tools, machining processes, cutting tools, jigs and fixtures; determination of operation sequences; the generation of NC part programs, etc. Hence, process planning involves selection, calculation and documentation, and this task can be very complex and time consuming, and it requires a great deal of data and experience. From the preceding discussion it is clear that process planning is an information-intensive task which requires a significant amount of time and experience. An Automated Process Planning system attempts to automate the process planning task by modeling human-like intelligence. This includes gathering expert knowledge from individuals (employees, customers and suppliers) and making it available to the entire organization to create a culture of knowledge sharing. It also saves time and cost in the transition between design and manufacture. Thus, the system provides intelligent functions for automatic generation of process plans. It also provides an XML-based data exchange interface to integrate the CAD/CAM system with other life cycle issues such as PDM, ERP etc. The remainder of this paper is organized as follows. Section II gives a process planning overview. Section III surveys prior work. Section IV describes in brief the design of the Automated Process Planning System developed, and Section V concludes the paper.
II. PROCESS PLANNING OVERVIEW
In a traditional manufacturing environment, a process plan is generated by a process planner, who examines a part drawing to develop an efficient and feasible process plan to produce the part economically. This manual approach to process planning depends heavily on the knowledge and experience of the process planner to develop accurate, feasible and consistent process plans. In order to prepare a new process plan, the process
planner must be able to manage and retrieve a great deal of data and documents to identify a process plan for a similar part and make the necessary modifications to produce the plan for the new part. CAD/CAM integration has several benefits. First, the automation of process planning directly follows the design stage, resulting in consistent and accurate production plans. Second, integration reduces the workload on production planners and consequently decreases planning cost and time. Third, it provides faster responses to changes in product design and/or in shop floor status. Fourth, CAPP systems enable firms to take a new product from concept into manufacturing in a short time. All these benefits have a substantial impact on the overall productivity of the manufacturing company. There exist two approaches to the design of CAPP systems [1]: the variant and generative frameworks. In the variant CAPP approach, a process plan for a new part is created by recalling, identifying and retrieving an existing plan for a similar part and making the necessary modifications for the new part. Sometimes the process plans are developed for parts representing a family of parts, called 'master parts'. The similarities in design attributes and manufacturing methods are exploited for the purpose of forming part families. A number of methods have been developed for part family formation using coding and classification schemes of group technology (GT), similarity-coefficient-based algorithms and mathematical programming models. The variant process planning approach can be realized as a four-step process: 1. definition of a coding scheme; 2. grouping parts into part families; 3. development of a standard process plan; 4. retrieval and modification of the standard process plan. A number of variant process planning schemes have been developed and are in use. The next stage of evolution is towards generative CAPP. In generative CAPP, process plans are generated by means of decision logic, formulas, technology algorithms and geometry-based data, to uniquely perform the many processing decisions needed for converting a part from raw material to its finished state. There are two major components of generative CAPP: a geometry-based coding scheme, and process knowledge in the form of decision logic and data. The geometry-based coding scheme defines all geometric features for process-related surfaces, together with feature dimensions, locations, tolerances and the surface finish desired on the features. The level of detail is much greater in a generative system than in a variant system. For example, details such as the rough and finished states of the parts and the process capability of machine tools to transform these parts to the desired states are provided. Process knowledge in the form of decision logic and data matches the part geometry requirements to the manufacturing capabilities using the knowledge base. It includes the selection of processes, machine tools, jigs or fixtures, tools, inspection equipment and sequencing operations. The development of the manufacturing knowledge base is the backbone of generative CAPP. The tools that are widely used in the development of this database are flowcharts, decision tables, decision trees, iterative algorithms, the concept of unit machined surfaces, pattern recognition techniques and artificial intelligence techniques such as
expert system shells. In this paper we describe an implementation of artificial intelligence techniques in the generative process planning framework.
III. PRIOR WORK
The use of Artificial Intelligence (AI) in process planning can be seen as early as 1980 in the process planning system GARI [2], developed at the University of Grenoble in France. The system uses a set of reasoning rules as the representation of its knowledge base. A part is represented to the process planning module in terms of a set of features like holes, notches etc., which includes geometrical and other manufacturing-related information. The system provides the capability of backtracking from any of the intermediate stages of process plan development to make necessary revisions. It assigns weights to different pieces of advice at each stage of process plan development, in order to resolve any conflicts. Technostructure of Machining (TOM) [3] is a rule-based expert system developed at the University of Tokyo. TOM uses reasoning rules as its knowledge representation scheme for machining operations, sequencing and the geometry of a part. It employs a backtracking search mechanism to generate a process plan. Hierarchical and Intelligent Manufacturing Automated Process Planning (HI-MAPP) [4] is another AI-based process planning system developed by the University of Tokyo. In this system the knowledge base consists of 45 reasoning rules that are classified into 4 categories: rules that define the process, the type of cut, the type of machine, and rules that can select any other miscellaneous action that the planner wants. HI-MAPP then applies hierarchical and nonlinear planning concepts. The Computer-Integrated Process Planning and Scheduling (CLIPPS) [5] system consists of integrated modules for automated feature recognition, for determining an efficient and feasible process plan, and for generating a production schedule. The system provides feedback to design and manufacturing engineers to fully evaluate a design and ensure that the product can be manufactured in a cost-effective manner. A recent article on the subject of process planning and scheduling [6] gives a good review of the integration of rule-based process selection. Most of the CAPP systems reported in the literature are developed in universities and research labs, or are proprietary and designed in-house to solve specific process planning problems.
IV. DESIGN OF AUTOMATED PLANNING SYSTEM
The goal of the Automated Planning System is to automatically generate a process plan according to the design information and the manufacturing knowledge available in an enterprise. Its focus is on improving work efficiency and quality through integration, intelligence and information management. The general architecture of the feature-based Automated Planning System is shown in Fig. 1. The Feature Recognizer module, illustrated in Fig. 2, extracts and translates the design information given in the CAD model into manufacturing information. Input to the Feature Recognizer consists of a STEP file containing CAD
models generated by some external solid modelers.

Figure 1. Architecture of Automated Process Planning System (principal modules: Feature Recognizer, Automatic Process Planning / Reasoning Engine, Reasoning Rule Knowledge Base and Resource Database, producing a process plan report)

The list of manufacturing features presently recognized by the Feature Recognizer includes: Cylindrical Holes/Solids, Chamfers, Fillets & Edge Rounds, Slots & Steps, Pockets, Cones, Tapers & Spheres, Ribs, Threads, Grooves, Bosses, Irregulars and Voids. On completion of the feature recognition task, the Feature Recognizer outputs all the recognized features in an XML file along with their feature parameters, as shown in Fig. 3. The Automatic Process Planning module contains a manufacturing knowledge base specified in terms of Reasoning rules. Each Reasoning rule contains a detailed set of instructions as to what resources are required to manufacture each feature; these rules are compiled by the systems personnel by interacting with a manufacturing expert. The Process Planning (Reasoning Engine) module takes the XML file containing the recognized features as input, refers it to the knowledge base, and tries to find out which of the Reasoning rules can be satisfied. The Knowledge Base module typically manages company-specific manufacturing knowledge and experience. The reasoning rules cover the whole manufacturing process, from raw material selection to finishing operations. The Resource Database module manages all manufacturing resources, including machine tools, cutting tools, measuring tools etc. The output is a detailed process plan, shown in Fig. 5, that typically contains a setup operation (how to place and clamp the workpiece), nominal machining operations such as drilling and milling with process details such as tool diameter, feed rate and number of passes, and finally a finishing operation (if needed to improve the tolerances) along with its process details.

A. Reasoning Rules and Knowledge Base
The system divides the manufacturing process into six phases. Each phase involves decision-making rules, which we call Reasoning rules. The user can define rules according to their own experience and practical situation. Each rule can be represented in the form if ⟨condition⟩ then ⟨action⟩. The structure of the Reasoning rules is defined in a separate module called the Knowledge Base, as shown in Fig. 4.
Figure 2. Recognized feature with non-interacting solid and corresponding feature tree with manufacturing information
The user just needs to enter the relevant parameters to generate a new rule. The six distinct sets of rules identified are: selection of material, selection of manufacturing method, selection of machine tool, selection of fixture, selection of sequence of operations, and selection of cutting tool. Each entity in the selection process is described with attributes, and the selection of a particular entity is based on the values of its attributes. For instance, raw material is described by its dimensions and material strength. The manufacturing method is decided by the feature type: is it a hole or a slot? Different methods will be selected for different feature types, along with their parameters and attributes. For example, the parameters of a hole include radius, depth, point angle, tolerance, surface roughness, material, hardness etc. The machine tool selection depends on the type of operation: is it rough turning, finished turning or drilling? The fixture selection mainly depends on the shape of the raw material part, for example "Bolt", "Tube" or "Block". The sequence selection depends on the order in which the operations need to be carried out and on machine tool optimization (shortest time, cost, etc.); for instance, rough turning should be performed before finished turning. In short, the knowledge base consists of logical rules which depict relationships or constraints among the different entities involved in manufacturing. Each rule is also associated with a priority. The priority defines the order in which rules are selected while generating the process plan. The kind of relationships specified in the knowledge base depends on the understanding and experience of the manufacturing expert. We have provided an interface so that new rules can be inserted and old rules can be deleted or edited. The rules in the Knowledge Base directly influence the final process plan: the quality of these rules will directly affect the results of the process planning. The structure of the reasoning rules is defined in this module and the user just needs to enter the relevant parameters to generate a new rule. These user-defined rules are also easy to modify and change in order to fit the actual situation.
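The if ⟨condition⟩ then ⟨action⟩ rules with priorities described above map naturally onto a small data structure. The following Python sketch uses invented example rules for illustration; it is not the system's actual knowledge base.

```python
# Minimal sketch of priority-ordered if-then Reasoning rules.
# The example conditions and actions are invented for illustration.
from dataclasses import dataclass
from typing import Callable

@dataclass
class Rule:
    name: str
    condition: Callable[[dict], bool]   # tested against feature parameters
    action: Callable[[dict], str]       # yields one process-plan step
    priority: int = 0                   # higher priority fires first

def plan_feature(feature: dict, rules: list) -> list:
    """Apply every satisfied rule to a feature, in priority order."""
    steps = []
    for rule in sorted(rules, key=lambda r: -r.priority):
        if rule.condition(feature):
            steps.append(rule.action(feature))
    return steps

rules = [
    Rule("drill-hole",
         lambda f: f.get("type") == "hole",
         lambda f: f"drill hole, radius {f['radius']} mm, depth {f['depth']} mm",
         priority=10),
    Rule("finish-tight-tolerance",
         lambda f: f.get("tolerance", 1.0) < 0.05,
         lambda f: "finishing operation to meet tolerance",
         priority=1),
]

print(plan_feature({"type": "hole", "radius": 5, "depth": 20,
                    "tolerance": 0.01}, rules))
```

Encoding the priority as plain data keeps conflict resolution transparent: when two rules both fire, the planner's preferred rule wins simply because it sorts first.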
Figure 3. Output XML file with manufacturing feature information
Figure 4. Structure of Reasoning Rules in Knowledgebase

Figure 5. Output Process Plan
The Resource Database is used to organize and manage the manufacturing resources for the manufacturing phase. The resource base is actually a fact sheet that contains all the material resources available in the enterprise. The structure of the manufacturing resources is defined in the Resource Database module, and the user needs only to input the values in the resource record.

B. Intelligent Reasoning Engine
The Automatic Process Planning System works on the generative process planning method. The generative method has two ways of generating a process plan, i.e. reasoning step by step and reasoning continuously. Both approaches use intelligent reasoning based on user-defined rules. The system automatically develops a process plan depending on the knowledge base, the resource database and the feature information from the Feature Recognizer. The Automatic Process Planning module calls Reasoning rules from the knowledge base and resource information from the resource database to generate the process planning report. The heart of the system is the Reasoning Engine, which is designed for this special role. The reasoning engine uses an inference mechanism to automatically generate an optimal process plan by applying the Reasoning rules. Loading manufacturing feature information is the first step towards generating a process plan report. Manufacturing feature information is stored in an XML file, which is the output of the Feature Recognizer Module.
Figure 6. Methods to generate a Process Plan (Generative method, reasoning step by step or continuously; Template method; Manual method)

The job of the reasoning engine is to tell which rules are satisfied at any given point in time. Apart from the generative process planning method, there are two other approaches to generating a process plan, namely the Template Method and the Manual Method, as shown in Fig. 6. The Generative Method automatically builds a process plan using the knowledge saved in the knowledge base and the resources saved in the Resource Database. The Manual Method allows building the process plan manually, without the knowledge base and the Resource Database. Template process planning is used for generating a process plan report which has similar characteristics to a saved template. The Automatic Process Planning module provides process plan edit operations such as insert, delete, modify, move, copy, paste and save.
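As an illustration of the first step, loading manufacturing feature information, the following sketch parses a feature XML file; the XML layout shown is our own invented example, since the actual Feature Recognizer schema is not reproduced in this paper.

```python
# Sketch: loading recognized features from the Feature Recognizer's
# XML output. The XML layout below is an invented example; the actual
# schema produced by the Feature Recognizer is not specified here.
import xml.etree.ElementTree as ET

SAMPLE = """<features>
  <feature type="hole" radius="5" depth="20" tolerance="0.01"/>
  <feature type="slot" width="8" length="40"/>
</features>"""

def load_features(xml_text):
    """Return one attribute dictionary per recognized feature."""
    root = ET.fromstring(xml_text)
    return [dict(f.attrib) for f in root.findall("feature")]

for feature in load_features(SAMPLE):
    # each feature dict would be handed to the rule base, e.g. the
    # plan_feature() sketch shown earlier, to obtain plan steps
    print(feature)
```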
V. CONCLUSIONS
This paper describes an automatic process planning technique that allows a manufacturing expert to incrementally specify rules in the knowledge base and gradually enhance the degree of automation. It is observed that the rules in the knowledge base directly influence the quality of the process plan. It is envisaged that over a period of time, with trial and error, the system will stabilize and dependence on an expert process planner will decrease. One practical application of our system is to estimate the cost of manufacturing the product in the design stage itself.
ACKNOWLEDGMENT
This work was carried out at New Vision, Tivim Industrial Estate, Mapusa, Goa, India on behalf of Xplano AS, Trondheim, Norway, where the second author was involved as a consultant. We would like to thank the management of New Vision and Xplano, especially Dr. Ketil Bo, Dr. Mohsen Pourjavad and Bob Lloyd, for their valuable cooperation and support in promoting this research.
REFERENCES
[1] Serope Kalpakjian and Steven Schmid, Manufacturing Engineering and Technology, 4th Ed., Pearson Education Asia.
[2] Yannick Descotte and Jean-Claude Latombe, "GARI: A Problem Solver that Plans How to Machine Mechanical Parts," Vol. 2, pp. 766-772.
[3] K. Matsushima, N. Okada and T. Sata, "The integration of CAD and CAM by application of artificial intelligence," Annals of CIRP 31, 1 (1982).
[4] H. R. Berenji and B. Khoshnevis, "Use of artificial intelligence in automated process planning," Computers in Mechanical Engineering (1986), 45-55.
[5] Khalid Aldakhilallah and R. Ramesh, "Computer Technique and Applications of Automated Process Planning in Manufacturing Systems," Computer Aided and Integrated Manufacturing Systems, Vol. 2, 135-158.
[6] D. N. Sormaz, J. Arumugam and C. Ganduri, "Integration of rule-based process selection with virtual machining for distributed manufacturing planning," 61-89, Process Planning and Scheduling for Distributed Manufacturing, Springer Verlag, 2007.
Mining Landmark Papers

Annu Tuli
Center for Data Engineering
IIIT-H
Hyderabad, India
[email protected]

Vikram Pudi
Center for Data Engineering
IIIT-H
Hyderabad, India
[email protected]
Abstract—In recent years, the number of electronic journal articles has been growing faster than ever before; information is generated faster than people can deal with it. To handle this problem, many electronic periodical databases have introduced keyword search methods to decrease the effort and time spent by users in searching a journal's archives. However, the users still have to deal with a huge number of search results. In this paper, we present the problem of mining landmark papers. We treat papers that introduce important keyphrases for the first time as landmark papers. Our approach combines simple ideas from text mining, information extraction and information retrieval to identify landmark papers. We show that existing related techniques such as first story detection, mining hot topics and theme mining do not effectively handle the landmark paper mining problem; our approach is simpler and more direct for this task. We experimentally evaluate our approach on a large dataset of papers in the database and data mining areas downloaded using DBLP.

Keywords - Landmark papers, Text Mining, Information Extraction and Retrieval, First-Story Detection.
I. INTRODUCTION
Text Data Mining (TDM) can be considered a field of its own, containing a number of applications. It has also been known as text analysis, text mining or knowledge discovery in text. In general, TDM applications are used to extract non-trivial and useful information from large corpora of text data, which are available in unstructured or structured format. Text mining applications require the use of many related fields such as Information Retrieval, Machine Learning, Statistics and Linguistics. There are various applications of TDM, such as in bioinformatics, market research, consumer trend studies and scientific research [1]. The internet today is going through a rapid phase of growth and development. With the growth of the internet, information contained in electronic documents is increasingly widespread, with the World Wide Web as its primary repository. The convenience of electronic documents has motivated their more efficient application in information management and knowledge discovery [2]. In many application domains, we encounter a stream of text in which each text document has some meaningful time stamp [3]. For example, a collection of news articles about a topic and research papers in a subject area can both be viewed as natural text streams with publication dates as time stamps. In such text data streams, there often exist some interesting and meaningful keywords. For example, an event covered in news articles generally has some meaningful keywords consisting of themes (i.e., subtopics) characterizing the beginning, progression, and impact of
the event, among others. Similarly, in research papers, some important and meaningful keywords may exhibit similar patterns. For example, the study of one topic, specified by some keyphrases in some time period, may have influenced or stimulated the study of another topic associated with the same keyphrases after that time period. In all these cases, it would be very useful if we could discover and extract these important keyphrases and also automatically identify the first corresponding paper, to learn where the keyphrases originate with respect to the time stamp. Indeed, such research papers are not only useful by themselves, but would also facilitate organization and navigation of the information stream according to the underlying keywords. Consider, for example, that there are often hundreds of research papers published annually in a research area. A researcher, especially a beginning researcher, often wants to understand how the research topics in the literature have been evolving. For example, if a researcher wants to know about data mining, both the historical milestones and the recent research trends of data mining would be valuable for him/her. Identifying the origins of important and new keyphrases will also make it much easier for the researcher to selectively choose an appropriate new field of research. The corresponding first document (i.e. landmark paper) for a keyphrase will also help the researcher to read only those papers matching his/her research interests. Mining landmark papers is not only useful for the beginning researcher, but for anyone keeping track of important developments in a particular area. This is important today due to the large numbers of researchers and published research papers. Keeping track of landmark papers is especially useful to track key research developments, not necessarily in the specific area of a researcher, but in its numerous related areas, which tend to be voluminous. This paper is organized as follows: In Section 2, we introduce the related work. In Section 3, we describe our problem definition, and in Section 4 we describe the methodology of our work. Detailed experimental results are presented in Section 5. In Section 6, we draw conclusions and present future work.
II. RELATED WORK
Information Retrieval (IR) and Information Extraction (IE) areas are associated with text mining. IE has the goal of transforming a collection of documents into information that is more readily digested and analyzed with the help of an IR system. IE extracts relevant facts from the documents, while IR selects relevant documents. IE is a kind of pre-processing stage in the text mining process,
which is the step after the IR process and before data mining techniques are performed. A typical information retrieval problem is to locate relevant documents in a document collection based on a user's query, which is often some keywords describing an information need, although it could also be an example relevant document. In such a search problem, a user takes the initiative to "pull" the relevant information out from the collection; this is most appropriate when a user has some ad hoc (i.e. short-term) information need, such as finding information to buy a used car. When a user has a long-term information need (e.g. a researcher's interests), a retrieval system may also take the initiative to "push" any newly arrived information item to a user if the item is judged as being relevant to the user's information need. Given the avalanche of electronic documents, the pervasive use of search engines helps to minimize the time required to extract information [2]. In the most popular form of search, the search criteria are keywords or concepts that may be contained in the electronic documents [4]. However, the users still have to deal with the overabundance of search results in some way. During the last decade, the question of how best to filter the results of search engines has become an important issue. Topic detection is an experimental method for automatically organizing search results. It could help users save time in identifying useful information from large-scale electronic documents. A topic is defined to be a seminal event or activity along with all directly related events and activities [5]. Today, many different data mining methods are employed to recognize topics, for instance the naive Bayes classifier [6], hierarchical clustering algorithms (HCA) [7][8][9], paragraph relationship maps [10], formal concept analysis [11] and lexical chains [12][13][14]. These methods use the frequencies of words to calculate the similarity between two documents; therefore, their accuracy is greatly hindered by the presence of synonyms. Halliday and Hasan [13] proposed a semantics-based lexical chain method that can be used to identify the central theme of a document. Based on the lexical chain method, combined with the electronic WordNet database, the proposed method clusters electronic documents by semantic similarity and extracts the important topics for each cluster. Ultimately, the method provides more user-friendly search results that are relevant to the topics. Shewhart and Wasson [17] described a process that monitors newsfeeds for topics that receive unexpectedly high amounts of coverage (i.e. hot topics) on a given day. They performed trend analysis in order to find hot topics, except that they used controlled vocabulary terms rather than phrases extracted from text. The purpose of the study is to monitor newsfeeds in order to identify when any topic from a predefined list of topics is a hot topic. In [1], Indro De presented First Story Detection (FSD), whose task requires identifying those stories within a large set of data that discuss an event that has not already been reported in earlier stories. In this FSD approach, algorithms look for keywords in a news story and compare the story with earlier stories. FSD is defined as the process of finding all stories within a corpus of text data that are the first stories describing a certain event [15]. An event is a topic that is described or reported in a number of stories.
Examples can be governmental elections, natural disasters, sports events, etc. The First Story Detection process runs sequentially, looking at a time-stamped stream of stories and making the decision based on a comparison of key terms to previous stories. FSD is closely linked to the Topic Detection task, a process that builds clusters of stories that discuss the same topic area or event [16]. Comparably, FSD evaluates the corpus and finds stories that are discussing a new event. FSD is a more specialized version of Topic Detection, because in Topic Detection the system has to determine when a new topic is being discussed, and the resulting stories will be the "first stories". In 2005, Mei and Zhai [3] discovered evolutionary theme patterns from text information collected over time. Temporal Text Mining (TTM) has many applications in multiple domains, such as summarizing events in news articles and revealing research trends in scientific literature. In their paper, the TTM task is to discover and summarize the evolutionary patterns of themes in a text stream. They define this new text mining problem and present general probabilistic methods for solving it through (1) discovering latent themes from text; (2) constructing an evolution graph of themes; (3) analyzing life cycles of themes.

A. Differences from Landmark Paper Mining
We now show that existing related techniques, specifically first story detection, hot topic mining and theme mining, do not effectively handle the landmark paper mining problem; our approach is simpler and more direct. We cannot effectively reduce our requirements to first story detection, hot topic mining or theme mining. In first story detection (FSD) [1], algorithms look for keywords in a first news story and compare the story with earlier stories. FSD is the process of finding all stories within a corpus of text data that are the first stories describing a certain event [15]. The FSD process runs sequentially, looking at a time-stamped stream of stories and making the decision based on a comparison of key terms to previous stories. FSD is closely linked to the topic detection task [16], a process that builds clusters of stories that discuss the same topic area or event. Landmark paper mining differs significantly from FSD in the following ways. In FSD, a new story is detected as being a first story if it has a significant vocabulary shift from recent stories. First, a vocabulary shift could occur even without the introduction of new key terms, if the frequencies of existing key terms are significantly altered. Second, a document can be flagged as a first story even when there is an earlier document with the same key terms and frequencies. For example, even if there was an earthquake last year, the first story describing a more recent earthquake will be detected as a first story. In hot topic mining [17], a topic is known as hot when it receives an unusually high amount of coverage in the news on a given day because of the importance of the events involving that topic. The authors used trend analysis in order to find hot topics, except that they used controlled vocabulary terms rather than phrases extracted from text. Landmark paper mining is clearly a different problem, as it seeks to mine interesting papers instead of interesting topics.
Finally, the Temporal Text Mining (TTM) [3] task is to discover, extract and summarize the evolutionary patterns of themes in a text stream. In that paper, the authors identify when a theme starts, reaches its peak, and then deteriorates, as well as which subsequent themes it influences. A timeline-based theme structure is a very informative summary of an event, which also facilitates navigation through themes. Theme mining can be considered an approach to mine interesting papers that originate themes. However, a new theme containing only existing keyphrases with altered frequencies does not necessarily represent a new concept. In fact, this step (of determining themes) is both unnecessary and insufficient to determine whether a paper originates a new concept. In contrast, a new keyphrase almost certainly indicates the presence of a new concept. In landmark paper mining, we therefore follow a simpler and more direct approach: we identify papers that originate important keyphrases instead of themes (which can contain a collection of keyphrases). Our approach is simpler because it avoids the notion of themes, so there is no need to decide which collection of keyphrases forms a theme. By avoiding this unnecessary step, our approach is more direct.
III. PROBLEM DEFINITION
In this paper we focus on the problem of finding documents (research papers) from a text corpus based on user-specified keyphrases, and also identifying the first document in the corpus where important keyphrases are introduced for the first time. We present the problem of mining landmark papers. This problem requires simultaneously understanding which keyphrases/topics are new or important and which documents introduce these keyphrases. The following definition formally captures the problem statement.

Landmark Paper Mining: Given a collection of time-indexed documents $C = \{d_1, d_2, \ldots, d_T\}$, where $d_i$ refers to a document with time stamp $i$, and each document is a sequence of words from a vocabulary set $V = \{w_1, w_2, \ldots, w_v\}$, the problem is to identify the first documents that introduce important keyphrases; these are known as landmark papers. This can be broken into two sub-problems: 1) find the right keyphrases/topics in a collection of documents; 2) identify the originating documents of important keyphrases, i.e. which documents introduced new keyphrases that had a large impact. Here, we use the notion of keyphrases instead of keywords to avoid meaningless terms. A keyphrase consists of a set of keywords. In our experiments, we set the minimum length of a keyphrase to 2 and the maximum to 3.
IV. METHODOLOGY
The methodology we describe is a general approach that can be applied to text corpora of varying complexity. The result of the mining is a set of landmark documents that match a query supplied by the user. Our methodology has the following major steps:
a) Extract n keyphrases from each full-text document using the Keyphrase Extraction Algorithm (KEA) [18]. In our experiments we set n = 10.
b) Find the number of occurrences of each keyphrase within a document and within the corpus.
c) For each keyphrase, sort the documents containing it in increasing order of time stamp, e.g. conference year.
d) Remove keyphrases that are not contained in at least minsup documents. In our experiments, we set minsup = 3.
e) In the sorted list of documents, the first document for a keyphrase is identified as a candidate landmark paper corresponding to that keyphrase.
The above parameter minsup ensures that the keyphrases considered are persistent and thereby important. In addition to the above steps, we have the following pruning step to refine the results:
f) Remove keyphrases that are contained in the References section of their corresponding candidate landmark papers.
A sketch of these steps is given below.
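Steps (b) through (f) reduce to straightforward bookkeeping once keyphrases are available. The sketch below assumes step (a), keyphrase extraction with KEA, has already produced for each document its year, keyphrase list and references text; this data layout is our own assumption.

```python
# Sketch of steps (b)-(f) above. Assumes step (a) -- keyphrase
# extraction, done with KEA in the paper -- already yielded, for each
# document, its (doc_id, year, keyphrases, references_text).

def mine_landmarks(docs, minsup=3):
    """docs: list of (doc_id, year, keyphrases, references_text)."""
    by_phrase = {}
    for doc_id, year, phrases, refs in docs:
        for p in set(phrases):
            by_phrase.setdefault(p, []).append((year, doc_id, refs))

    landmarks = {}
    for phrase, hits in by_phrase.items():
        if len(hits) < minsup:          # step (d): keep persistent phrases
            continue
        hits.sort()                     # step (c): increasing time stamp
        year, doc_id, refs = hits[0]    # step (e): candidate landmark
        if phrase.lower() in refs.lower():
            continue                    # step (f): phrase cited earlier
        landmarks[phrase] = doc_id
    return landmarks
```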
V. EXPERIMENTAL EVALUATION
To evaluate the performance of our algorithm, we prepared a database of well-tagged "Data Mining/Databases" research papers from the DBLP website. DBLP (Digital Bibliography and Library Project) is a computer science bibliography website hosted at the University of Trier in Germany. The website maintains information regarding research papers in various fields of computer science, and currently has more than 2^20 research papers indexed. We experimentally evaluated our approach on a large dataset of research papers in the databases and data mining areas. The research papers are classified by their topic, conference, year of publication and authors. We extracted the information related to data mining/databases conferences like SIGMOD, VLDB etc. and stored the results in our database to perform the experiments. The information we extracted includes the year of publication, authors, conference, paper title and general paper topic; we also obtained the link from which the user can download the full-text PDF files of the research papers. We converted the full-text PDF files into text files using the pdf2text software in Linux. The basic statistics of the datasets are shown in Table I. We intentionally did not perform stemming or stop word pruning, in order to test the robustness of our algorithm. The methodology described in Section IV is performed on the whole dataset. After performing these steps on the whole dataset, we refined our results using the occurrence of keyphrases in the references of the corresponding document. If any of the keyphrases is present in the references of the document, this implies that the keyphrase was introduced earlier and the corresponding document will not be a landmark paper. If it is not present in any of the references, this implies that the keyphrase is
introduced for the first time in that document, and we recommend that document as a landmark paper.

TABLE I. BASIC INFORMATION ABOUT DATASETS

Conferences   # of docs   Time range
VLDB          420         2005-2007
SIGMOD        352         2005-2007
ICDM          373         2005-2007
KDD           331         2005-2007
Total         1476        3 years

The results are shown in Table II, where the first column lists the keyphrases, the second column gives the corresponding first document from the sorted list of documents, and the third column indicates whether the document is a landmark paper by reference checking: Yes if the keyphrase is not in the references of the document, and No if it is present in the references.

TABLE II. LIST OF PAPERS BY CHECKING REFERENCES

Keyphrases                    First document from sorted list   Landmark paper by references checking
Data Management               05_sigmod_16_Paper Title          No
RFID data                     05_vldb_97_Paper Title            Yes
Load shedding                 05_vldb_122_Paper Title           No
Data publishing               05_sigmod_82_Paper Title          Yes
Schema mapping                06_vldb_111_Paper Title           No
BPEL specification            06_vldb_31_Paper Title            No
Access control                05_sigmod_27_Paper Title          No
Twig query                    05_vldb_32_Paper Title            No
Partial answers               06_vldb_20_Paper Title            Yes
Graph database                05_sigmod_53_Paper Title          Yes
Road network                  05_vldb_73_Paper Title            Yes
Data clustering               05_kdd_73_Paper Title             No
Quasi-identifier attributes   05_sigmod_112_Paper Title         Yes

We validated our results by checking manually in the documents. We took a random sample of approximately 50 keyphrases and checked manually in the corresponding documents whether these keyphrases were introduced for the first time. If a keyphrase is introduced for the first time in a document, has some meaningful significance, and is also not present in the references, then we keep that document as a landmark paper. Out of this random sample of 50, we took 13 keyphrases and their corresponding documents as representatives to show our results. Table III has three columns: the first column lists the keyphrases, the second column gives the corresponding first document from the sorted list of documents, and the third column indicates whether the document is a landmark paper by manual checking: Yes if the keyphrase is present for the first time in the document and is also not in its references, and No if it is not.

TABLE III. LIST OF PAPERS BY MANUAL CHECKING

Keyphrases                    First document from sorted list   Landmark paper by manual checking
Data Management               05_sigmod_16_Paper Title          No
RFID data                     05_vldb_97_Paper Title            Yes
Load shedding                 05_vldb_122_Paper Title           Yes
Data publishing               05_sigmod_82_Paper Title          Yes
Schema mapping                06_vldb_111_Paper Title           No
BPEL specification            06_vldb_31_Paper Title            Yes
Access control                05_sigmod_27_Paper Title          No
Twig query                    05_vldb_32_Paper Title            Yes
Partial answers               06_vldb_20_Paper Title            No
Graph database                05_sigmod_53_Paper Title          Yes
Road network                  05_vldb_73_Paper Title            No
Data clustering               05_kdd_73_Paper Title             No
Quasi-identifier attributes   05_sigmod_112_Paper Title         Yes

In Table V, we have combined the results from Tables II and III. In this table, the first and second columns indicate the keyphrase and the corresponding first document from the sorted list of documents; the third and fourth columns give the results from reference checking and manual checking; and the last column marks landmark papers, showing Yes if a document satisfies the above criteria, and "-" otherwise. For such documents, the corresponding keyphrase is important and new to the user. The physical significance of a first document, e.g. "05_vldb_97_Paper Title" from the sorted list, is that this is the first document in our database where the keyphrase "RFID data" is introduced. As shown in Table I, we took 1476 documents from the VLDB, SIGMOD, ICDM and KDD conferences with a time stamp of 3 years, and "05_vldb_97_Paper Title" represents the first document (landmark paper) where this keyphrase is new and introduced for the first time. The output of our algorithm is not limited to this setting, and it will be more accurate if we run the algorithm on larger datasets. Also, for the keyphrase "twig query" we have seen that it occurs only in VLDB 2005 and 2006 research papers, and not in the other conferences within the given time stamp in our database. This implies that the keyphrase is not used after these conferences and years, so it will be helpful for a new researcher or user to choose this keyphrase as an important keyphrase to continue work in this area. Also, some of the keyphrases occur only in recent years (e.g. 2007) and not before in our database, giving the user or researcher the intuition that this is an important or new keyphrase introduced very recently in the
ERTAI-2010
Mining Landmark Papers
coming years. For keyphrase "graph database" we analyze that this keyphrase occurred in our whole database very frequently implies the keyphrase has equal influence throughout the time stamp window and it may occurs in coming years implies that the keyphrase is neither new nor important for a new user. In addition to this, for the keyphrases which are new and important for a user, we are also providing corresponding first document to the user, from where it originates. This will provide help to the new researcher or user to read the documents to get the overview of the keyphrases from where it introduced. To check the accuracy of our results, we use common measurements in information retrieval that are recall and precision. Precision (P) is the percentage of retrieved documents that are in fact relevant to the query (i.e., "correct" responses), and Recall (R), is the percentage of documents that are relevant to the query and were, in fact, retrieved.
classified and error rate identify how many are misclassified. The accuracy and error rate is given by:
Accuracy = And,
error − rate = 1 − Accuracy
(1)
And
recall =
tp tp + fn
(2)
Where, tp represents true positive, tn is true negative, fp is false positive and fn is false negative respectively. An information retrieval system often needs to trade off recall for precision or vice-versa. One commonly used trade-off is the F-score, which is defined as the harmonic mean of recall and precision:
F − score =
2∗ R∗ P P+ R
(3)
We also show the confusion matrix, accuracy rate and error rate for our experimental results. The confusion matrix is a useful tool for analyzing how well our predicted results can match to actual results. A confusion matrix for predicted class and actual class is shown in table V. TABLE IV. A CONFUSION MATRIX FOR POSITIVE & NEGATIVE TUPLES
5 (tp)
2(fp)
3(fn)
3(tn)
From table IV, calculated precision is 0.714, recall is 0.625 and F-score is 0.667. We also estimated accuracy and error rate for our experiments. The accuracy of any system is the percentage of test set tuples that are correctly
CONCLUSION AND FUTURE WORK
In this paper we have identified those documents within a large set of data that discuss a keyphrase that has not already been reported in earlier documents. In this approach, algorithms look for keyphrases in a corpus of research papers and compare the keyphrase with earlier papers. For a given keyphrase, we have identified a document from the text documents collected over time with the condition is, that keyphrase should occur first time at this paper and also have some meaningful definition in this paper. In addition of this, we also identified the presence of these keyphrase in the references (e.g. Title, Conference name etc.) of the document, if it is not present any of the references of a particular document means it is not introduced before this document and before this time stamp. So, that keyphrase is new to that document and introduced first time at this document with some meaningful definition, will known as a landmark paper. Our approach combines simple ideas from text mining, information extraction and retrieval. In future, we are trying to run our algorithm on large database of conferences and time stamps .In addition to this, we can extend of this work to identify the landmark keyphrases by using evolutionary graph of important keyphrases and also we can extend this work to find the impact of the document based on frequent keyphrases. REFERENCES [1]
[2]
Predicted Results Actual Results
(5)
In our experiments, calculated accuracy is 0.62 and error rate is 0.38. This implies that our algorithm is reasonably accurate. On the other hand it is easy, direct and simple to understand. VI.
tp precision = tp + fp
tp + tn tp + tn + fp + fn (4)
[3]
[4]
[1] Indro De, "Experiments in First Story Detection," Proceedings of the National Conference on Undergraduate Research (NCUR), Virginia, April 2005, pp. 21-23.
[2] H. Wang, T. Huang, J. Guo and C. Li, "Journal Article Topic Detection Based on Semantic Features," IEA/AIE '09: Proceedings of the 22nd International Conference on Industrial, Engineering and Other Applications of Applied Intelligent Systems, Berlin, Heidelberg, 2009, pp. 644-652.
[3] Q. Mei and C. Zhai, "Discovering Evolutionary Theme Patterns from Text: An Exploration of Temporal Text Mining," KDD '05: Proceedings of the Eleventh ACM SIGKDD International Conference on Knowledge Discovery in Data Mining, Chicago, Illinois, USA, 2005, pp. 198-207.
[4] E. M. Voorhees, "Natural Language Processing and Information Retrieval," Information Extraction: Towards Scalable, Adaptable Systems, Springer, 1999, pp. 32-48.
[5] J. Allan, Topic Detection and Tracking: Event-based Information Organization, Kluwer Academic Publishers, Dordrecht, 2002.
[6] M. Lee, W. Wang and H. Yu, "Exploring supervised and unsupervised methods to detect topics in biomedical text," BMC Bioinformatics, vol. 7, 2006.
[7] P. Berkhin, "Survey of Clustering Data Mining Techniques," Accrue Software, Inc., 2002.
[8] A. Hotho, A. Nurnberger and G. Paass, "A Brief Survey of Text Mining," LDV Forum - GLDV Journal for Computational Linguistics and Language Technology, May 2005.
[9] M. Kantardzic, Data Mining, Wiley Inter-Science, Hoboken, 2003.
[10] G. Salton and M. J. McGill, Introduction to Modern Information Retrieval, McGraw-Hill, Inc., New York, 1983.
[11] R. Wille, "Restructuring lattice theory: an approach based on hierarchies of concepts," in: Rival, I. (ed.): Ordered Sets, Reidel, Dordrecht, 1982, pp. 445-470.
[12] R. Barzilay and M. Elhadad, "Using Lexical Chains for Text Summarization," in ACL/EACL Workshop on Intelligent Scalable Text Summarization, 1997.
[13] M. A. K. Halliday and R. Hasan, Cohesion in English (English Language), Longman, London, May 1976.
[14] P. Hatch, N. Stokes and J. Carthy, "Topic Detection, a new application for lexical chaining," in Proceedings of the 22nd BCS IRSG Colloquium on Information Retrieval, 2000, pp. 94-103.
[15] The Linguistic Data Consortium, The Year 2000 Topic Detection and Tracking TDT2000 Task Definition and Evaluation Plan, version 1.4, August 2000.
[16] National Institute of Standards and Technology, http://www.nist.gov/speech/tests/tdt/
[17] M. Shewhart and M. Wasson, "Monitoring a newsfeed for hot topics," KDD '99: Proceedings of the Fifth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, 1999, pp. 402-404.
[18] I. H. Witten, G. W. Paynter, E. Frank, C. Gutwin and C. G. Nevill-Manning, "KEA: Practical Automatic Keyphrase Extraction," Proc. DL '99, 1999, pp. 254-256.
TABLE V. LIST OF PAPERS BY CHECKING REFERENCES

| Keyphrase | First document from sorted list | Landmark paper by references checking | Landmark paper by manual checking | Landmark paper |
|---|---|---|---|---|
| Data Management | 05_sigmod_16_ Paper Title | No | No | - |
| RFID data | 05_vldb_97_ Paper Title | Yes | Yes | Yes |
| Load shedding | 05_vldb_122_ Paper Title | No | Yes | - |
| Data publishing | 05_sigmod_82_ Paper Title | Yes | Yes | - |
| Schema mapping | 06_vldb_111_ Paper Title | No | No | Yes |
| BPEL specification | 06_vldb_31_ Paper Title | No | Yes | - |
| Access control | 05_sigmod_27_ Paper Title | No | No | - |
| Twig query | 05_vldb_32_ Paper Title | No | Yes | Yes |
| Partial answers | 06_vldb_20_ Paper Title | Yes | No | - |
| Graph database | 05_sigmod_53_ Paper Title | Yes | Yes | Yes |
| Road network | 05_vldb_73_ Paper Title | Yes | No | - |
| Data clustering | 05_kdd_73_ Paper Title | No | No | - |
| Quasi-identifier attributes | 05_sigmod_112_ Paper Title | Yes | Yes | Yes |
Demarcation of Utterance Boundaries in Speech Signals Using Mahalanobis Distance Function as a Sample Clustering Technique

Srinivasa Rao Mutcha
Applied Artificial Intelligence Group, C-DAC Pune, India
e-mail: [email protected]

Rahul Sharma
Applied Artificial Intelligence Group, C-DAC Pune, India
e-mail: [email protected]
Abstract- One of the most challenging problems in speech processing is demarcation of utterances from background noise. This problem is often referred to as the end-point alignment problem. The accurate detection of a word's start and end points means that subsequent processing of the data can be kept to a minimum. Consider the speech recognition technique based on template matching: the exact timing of an utterance will generally not be the same as that of the template, and the two will also have different durations. In many cases the accuracy of alignment depends on the accuracy of the end-point detection. Also, the acoustic properties of different sounds (phonemes) pose the challenge of robustness, so that the algorithm must be able to cater to all the acoustical variations possible in human speech. Most methods proposed over the years use Short-Time Energy (STE) based analysis involving empirical measures of the speech signal, such as the zero crossing rate and energy thresholds, in order to estimate whether the signal region under consideration belongs to a speech utterance or to silence/background noise. In this paper we propose an implementation of utterance demarcation based on the Mahalanobis distance function.

Keywords: Utterance Demarcation, End-point alignment, Mahalanobis Distance Function, Normal Distribution, 3σ rule.
I. INTRODUCTION

The process of speech and/or speaker recognition essentially involves the signal preprocessing stage, tasked with efficient and robust extraction of acoustic properties from the speech signal. One of the facets of the feature extraction process is demarcation of utterances from background noise. This problem is often referred to as the end-point alignment/detection problem. In conventional end-point detection algorithms, the short-time energy or spectral energy is usually employed as the primary feature parameter, with the augmentation of zero crossing rate, pitch and duration information. But features thus obtained become less reliable in the presence of non-stationary noise and various types of sound artifacts. In general, such algorithms involve empirical estimation of energy thresholds, and the basic premise is that the energy content in the voiced region of speech is greater than in the silence/unvoiced region. However, the use of such ad hoc threshold values usually means that any change in the
ambient noise levels degrades accuracy. Another method involves computation of the zero crossing rate of the speech signal and the estimation of a zero-crossing threshold (ZCT). A combination of these two methods has been shown to yield better results than when either of them is employed individually. In this paper we discuss the implementation of an end-point detection algorithm employing a uni-dimensional Mahalanobis distance function governed by the 3σ rule, as a sample clustering mechanism to determine whether particular frames of a speech signal belong to silence or to voiced regions. The organization of the paper is as follows: Section 2 contains a brief introduction to the Mahalanobis distance function. Section 3 describes the end-point segmentation algorithm. Section 4 presents the inferences drawn from the exercise.

II. NORMAL DISTRIBUTION AND MAHALANOBIS DISTANCE FUNCTION

For most practical purposes the background/ambient noise present in recorded speech signals may be treated as Gaussian noise. Gaussian noise is statistical noise that has a probability density function (pdf) of the normal distribution (also known as the Gaussian distribution).

A. Normal Distribution
In probability theory and statistics, the normal distribution, also known as the Gaussian distribution, is a continuous probability distribution that often gives a good description of data that cluster around the mean. The graph of the associated probability density function is bell-shaped, with a peak at the mean, and is known as the Gaussian function or bell curve. The normal distribution is often used to describe, at least approximately, any variable that tends to cluster around the mean. The simplest case of a normal distribution is known as the standard normal distribution, described by the probability density function
$f(x) = \frac{1}{\sqrt{2\pi}}\, e^{-x^2/2}$ (1)
The constant $\frac{1}{\sqrt{2\pi}}$ in this expression ensures that the total area under the curve $f(x)$ is equal to one, and the $\frac{1}{2}$ in the exponent makes the "width" of the curve (measured as half of the distance between the inflection points of the curve) also equal to one. More generally, a normal distribution results from exponentiation of a quadratic function (just as an exponential distribution results from exponentiation of a linear function):

$f(x) = e^{ax^2 + bx + c}$ (2)

This yields the classic "bell curve" shape (provided that $a < 0$ so that the quadratic function is concave). One can adjust $a$ to control the "width" of the bell, then adjust $b$ to move the central peak of the bell along the x-axis, and finally adjust $c$ to control the "height" of the bell. For $f(x)$ to be a true probability density function over $\mathbb{R}$, one must choose $c$ such that the total area under the curve is one (which is only possible when $a < 0$). Rather than using the coefficients $a$, $b$ and $c$, it is far more common to describe a normal distribution by its mean $\mu = -b/(2a)$ and variance $\sigma^2 = -1/(2a)$. Changing to these new parameters allows us to rewrite the probability density function in a convenient standard form,

$f(x) = \frac{1}{\sigma\sqrt{2\pi}}\, e^{-\frac{(x-\mu)^2}{2\sigma^2}}$ (3)

Notice that for a standard normal distribution, $\mu = 0$ and $\sigma^2 = 1$. The last part of the equation above shows that any other normal distribution can be regarded as a version of the standard normal distribution that has been stretched horizontally by a factor $\sigma$ and then translated rightward by a distance $\mu$. Thus, $\mu$ specifies the position of the bell curve's central peak, and $\sigma$ specifies the "width" of the bell curve.

Figure 1. "Bell Curve"

B. Mahalanobis Distance Function
Many data mining and pattern clustering tasks involve calculating abstract "distances" between items or collections of items. Some modeling algorithms, such as k-nearest neighbours or radial basis function neural networks, make direct use of multivariate distances. The simplest and by far the most intuitive method is that of calculating the Euclidean distance between two items,

$d(p,q) = \sqrt{\sum_{i=1}^{n} (p_i - q_i)^2}$ (4)

where $d(p,q)$ is the Euclidean distance between two n-dimensional variables $p$ and $q$. This distance measure has a straightforward geometric interpretation, is easy to code and is fast to calculate, but it has two basic drawbacks. First, the Euclidean distance is extremely sensitive to the scales of the variables involved. In geometric situations, all variables are measured in the same units of length. With other data, though, this is likely not the case: most practical modeling problems deal with variables that have very different, non-comparable scales. Second, the Euclidean distance is blind to correlated variables. Consider a hypothetical data set containing 5 variables, where one variable is an exact duplicate of one of the others. The copied variable and its twin are thus completely correlated. Yet, Euclidean distance has no means of taking into account that the copy brings no new information, and will essentially weight the copied variable more heavily in its calculations than the other variables. This is explained in the following example: consider two points A and B that are equally distant from the center $\mu$ of the distribution, as shown in figure 2.

Figure 2. Euclidean Distance Measure

Yet, it seems inappropriate to say that they occupy "equivalent" positions with respect to $\mu$, as A is in a low-density region while B is in a high-density region. So, in a situation like this one, the usual Euclidean distance

$d^2(A,\mu) = \sum_{i} (o_i - \mu_i)^2$ (5)

does not seem to be the right tool for measuring the "distance" of a point to the center of the distribution. We could instead consider as "equally distant from the mean" two points with the same probability density: this would make them equally probable when drawing observations from the distribution. Suppose now that the distribution is multi-normal. Because of the functional form of the multivariate normal distribution, these two points would lead to the same value of the quantity:
$D^2 = (x - \mu)' \Sigma^{-1} (x - \mu)$ (6)

with $\Sigma$ the covariance matrix of the distribution. $D$ is called the Mahalanobis distance of the point $x$ to the mean $\mu$ of the distribution. The Mahalanobis distance takes into account the covariance among the variables in calculating distances. With this measure, the problems of scale and correlation inherent in the Euclidean distance are no longer an issue.
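As a concrete illustration, the following Python sketch evaluates Eq. (6) on estimated parameters. The sample data and library choices are illustrative assumptions, not part of the method described here.

```python
import numpy as np

# Draw correlated 2-D samples (illustrative data only).
X = np.random.default_rng(0).multivariate_normal(
    mean=[0.0, 0.0], cov=[[4.0, 1.9], [1.9, 1.0]], size=1000)

mu = X.mean(axis=0)                               # estimated mean vector
sigma_inv = np.linalg.inv(np.cov(X, rowvar=False))  # inverse covariance

def mahalanobis_sq(x):
    """Eq. (6): D^2 = (x - mu)^T Sigma^{-1} (x - mu)."""
    d = x - mu
    return float(d @ sigma_inv @ d)

# Two points with equal Euclidean distance from mu can have very
# different Mahalanobis distances once covariance is accounted for.
print(mahalanobis_sq(np.array([2.0, 1.0])),
      mahalanobis_sq(np.array([1.0, 2.0])))
```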
For the case of a uni-dimensional variable, the Mahalanobis distance is defined by the following formula:

$D = \frac{|x - \mu|}{\sigma}$ (7)

In the following section we discuss the use of the Mahalanobis distance function as a basis of end-point detection/alignment of semi-continuous speech signals.

III. END-POINT ALIGNMENT USING MAHALANOBIS DISTANCE & 3σ RULE

Assuming that the ambient noise present in the captured signal is Gaussian in nature, the uni-dimensional Mahalanobis distance function can be used to extract the voiced part of the signal. The algorithm is divided into two parts: background noise calibration, and sample clustering based on the 3σ decision rule.

A. Calibration of Background Noise
The initial process is that of calibration of ambient noise parameters. The parameters considered are the mean and standard deviation of the samples. At first a 200~300 msec window is chosen to find the parameters of the Gaussian distribution; the time duration of the window is chosen considering the fact that the speaker will take more than 200~300 msec to initiate speaking after recording starts. For a speech signal recorded at a sampling frequency of 16 kHz, this amounts to 3200 samples worth of data. If $\mu_{sil}$ and $\sigma_{sil}$ are the mean and the standard deviation, respectively, of the initial silence, then:

$\mu_{sil} = \frac{1}{N}\sum_{i=1}^{N} s[i]$ (8)

$\sigma_{sil} = \sqrt{\frac{1}{N}\sum_{i=1}^{N} \left(s[i] - \mu_{sil}\right)^2}$ (9)

where $s[i]$ is the instantaneous sample and $N = 3200$ samples. These values, viz. $\mu_{sil}$ and $\sigma_{sil}$, characterize the background noise.

B. Sample Clustering Based on the 3σ Decision Rule
The central limit theorem states that the distribution of a sum of many independent, identically distributed random variables tends towards the famous "bell-shaped" normal distribution with a pdf of:

$f(x) = \frac{1}{\sigma\sqrt{2\pi}}\, e^{-\frac{(x-\mu)^2}{2\sigma^2}}$ (10)

where $\mu$ is the arithmetic mean of the sample. The standard deviation is therefore simply a scaling variable that adjusts how broad the curve will be, though it also appears in the normalizing constant to keep the distribution normalized for different widths. In statistics, the three-sigma rule, or empirical rule, or 68-95-99.7 rule, states that for a normal distribution nearly all values lie within 3 standard deviations of the mean. If a data distribution is normal, then about 68% of the data values are within 1 standard deviation of the mean (mathematically, $\mu \pm \sigma$, where $\mu$ is the arithmetic mean), about 95% are within two standard deviations ($\mu \pm 2\sigma$), and about 99.7% lie within 3 standard deviations ($\mu \pm 3\sigma$). This is known as the 3σ rule, the 68-95-99.7 rule, or the empirical rule.

Figure 3. Normal Distribution

Having defined the uni-dimensional Mahalanobis distance function as

$d = \frac{|s[i] - \mu_{sil}|}{\sigma_{sil}}$ (11)

it is evident from the aforementioned 3σ rule that there is a probability of 99.7% that the distance $d$ will be less than 3. We may now proceed with sample clustering as follows. The entire speech recording is divided into non-overlapping frames of 25 msec. This is done based on the premise that all speech signals are quasi-stationary in nature, meaning that when a speech signal is examined over a sufficiently short period of time (between 10 and 100 msec), its characteristics are fairly stationary. For each frame thus created, beginning from the first sample, proceed to the last sample of the speech recording,
calculating for each sample $s[i]$ the Mahalanobis distance

$d = \frac{|s[i] - \mu_{sil}|}{\sigma_{sil}}$ (12)

If $d > 3$, then the sample is said to belong to the voiced segment of the signal; otherwise it is treated as silence/noise. Repeat the process for each frame. The last frame, if incomplete, is zero-padded. An intuitive annotation method is to mark each silence/noise sample thus obtained as S and each voiced sample as V. Now in place of the original speech signal we have an annotated stream of samples marked as S or V. Given that the frames are non-overlapping, it is safe to assume that for frames in which the number of V-marked samples is greater than the number of S-marked samples, the entire frame logically belongs to the voiced segment. The converse statement for a majority of S-marked samples also holds true. Retain all frames for which the number of V-marked samples is greater than the number of S-marked samples. Such frames constitute the voiced/speech segment of the recording, and by retrieving the starting and end indices of the frames we have the end-points of each voiced segment of speech. The aforementioned method is fairly intuitive and easy to implement.
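A minimal Python sketch of the complete procedure, calibration followed by frame-wise 3σ clustering, is given below. The function name and parameter defaults are illustrative assumptions; only the 16 kHz rate, the 200~300 msec calibration window, the 25 msec frames and the majority vote come from the description above.

```python
import numpy as np

def detect_voiced_frames(signal, fs=16000, calib_ms=200, frame_ms=25):
    """Calibrate background noise on the initial silence, then mark each
    25 msec frame voiced if most of its samples lie beyond 3 sigma."""
    n_calib = int(fs * calib_ms / 1000)          # e.g. 3200 samples at 16 kHz
    mu_sil = signal[:n_calib].mean()             # Eq. (8)
    sigma_sil = signal[:n_calib].std()           # Eq. (9)

    frame_len = int(fs * frame_ms / 1000)
    pad = (-len(signal)) % frame_len             # zero-pad the last frame
    frames = np.pad(signal, (0, pad)).reshape(-1, frame_len)

    d = np.abs(frames - mu_sil) / sigma_sil      # Eq. (12), per sample
    voiced = (d > 3).sum(axis=1) > frame_len / 2 # majority of V-marked samples
    return voiced                                # boolean mask, one per frame

# Usage (assumes `x` is a 1-D numpy array of samples at 16 kHz):
# x = ...; print(np.flatnonzero(detect_voiced_frames(x)))
```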
IV. EXPERIMENT RESULTS

In this section we discuss the performance of two different approaches to the problem of utterance demarcation in speech. At first we implemented the algorithm described in [2] that makes use of STE-based methods of energy and zero crossing rate thresholds. The system thus implemented was tested for performance accuracy using speech samples of a male speaker with different speaking styles: one set of recordings with semi-continuous speech, in which each recording had words separated temporally by the order of 50-60 milliseconds, and another set containing recordings of the same text but with a continuous speaking style. The following figure is a waveform for the utterance "I work in CDAC" in semi-continuous mode.

Figure 4. Recording of "I work in CDAC"

Fig. 5 and 6 denote the utterance demarcation achieved by using energy and ZCR thresholds, and by using the Mahalanobis distance technique, respectively.

Figure 5. Segmentation boundaries given by STE based method.

Figure 6. Segmentation boundaries given by Mahalanobis Distance measure based method.

It shall be noted that Fig. 6 depicts more regions than Fig. 5, owing to the fact that the Mahalanobis distance technique aims at separating the silence regions and utterance regions as separate clusters. The experiment data consisted of 10 recordings for each speaking style. Table I gives the accuracy obtained with each of the aforementioned techniques. Another observation of interest is that the choice of frame length was found to have a significant impact on the performance of the Mahalanobis distance based technique.

TABLE I. EXPERIMENT RESULTS (ACCURACY COMPARISON)

| Speaking Rate | STE | Mahalanobis |
|---|---|---|
| Semi-continuous | 75.4% | 87.2% |
| Continuous | 37.2% | 44.5% |
V. CONCLUSION

This paper described an implementation of utterance demarcation based on the Mahalanobis distance function, in order to overcome the problem of demarcating utterances from background noise. This is achieved through an end-point detection algorithm employing a uni-dimensional Mahalanobis distance function governed by the 3σ rule, as a sample clustering mechanism to determine whether particular frames of a speech signal belong to silence or to voiced regions. Finally, a comparison of the accuracy achieved in both cases is presented, which demonstrates the superior performance of the Mahalanobis distance measure based technique for utterance demarcation.
REFERENCES
[1] Ayaz Keerio, Bhargav Kumar Mitra, Philip Birch, Rupert Young, and Chris Chatwin, "On Preprocessing of Speech Signals," International Journal of Signal Processing, vol. 5, no. 3, pp. 216-222, February 2009.
[2] L. R. Rabiner and M. R. Sambur, "An algorithm for determining the endpoints of isolated utterances," Bell Syst. Tech. J., vol. 54, pp. 297-315, Feb. 1975.
[3] L. R. Rabiner and B. H. Juang, "Fundamentals of Speech Recognition," AT&T, 1993, Prentice-Hall, Inc.
[4] L. R. Rabiner and R. W. Schafer, "Digital Processing of Speech Signals," First Edition, Prentice-Hall.
[5] R. O. Duda, P. E. Hart, and D. G. Stork, "Pattern Classification," Second Edition, John Wiley and Sons Inc., 2001.
[6] L. R. Rabiner and M. R. Sambur, "An Algorithm for Determining the Endpoints for Isolated Utterances," The Bell System Technical Journal, vol. 54, no. 2.
Qualitative Spatial Representation of Direction of Motion

Rupam Baruah 1,2
Shyamanta M. Hazarika 2

1. Computer Science and Engineering, Jorhat Engineering College, Jorhat-7, Assam, India
[email protected]
2. Computer Science and Engineering, Tezpur University, Napam, Tezpur, Assam, India
[email protected]
Abstract— In many applications of cognitive vision, it is necessary to describe motion of spatial entities in a qualitative way. In this paper, we present a framework based on qualitative spatial reasoning for describing relative direction of spatial entities in motion. Spatial entities are represented by minimum bounding rectangles and qualitative relations are defined for direction of entities.

Keywords-Knowledge representation, QSR, JEPD, composition
I. INTRODUCTION
Representation of and reasoning with space and time in a qualitative manner has been an active area of research within the artificial intelligence (AI) community. Qualitative Spatial Reasoning (QSR) strives to provide calculi that allow a machine to represent and reason with spatial entities without resort to the traditional quantitative techniques. Qualitative spatial representations addressing many different aspects of space, including topology, orientation, shape, size and distance, have been put forward. An account of work done in Qualitative Spatial Reasoning is surveyed in [1]. Different aspects of space that have been treated in a qualitative manner are topology, cardinal direction, size, distance, shape, orientation, etc. Another important aspect is the motion of entities in space. In any real application of qualitative spatial reasoning, we will encounter spatial entities that move around in space, so it is important to be able to describe this movement in a qualitative manner. Inherent in motion is the concept of direction. If the entity is modeled as one dimensional, then it can well be considered as an interval in Allen's interval algebra [2]. Such an entity can have two possible directions of motion. Renz [3] has proposed a formalism called directed interval algebra for such entities. Dipole relation algebra [4] is another framework, if we consider our spatial entities as directed line segments. In this paper, we propose a framework for qualitative spatial representation of the direction of motion of two spatial entities. In many applications of QSR, like GIS and cognitive vision, spatial entities are two dimensional. In our representation, we consider entities that are two dimensional. Each entity is modeled by its Minimum Bounding Rectangle (MBR). The framework proposed here can be used for describing some events in cognitive vision and GIS applications in a qualitative manner. In such applications, sometimes a qualitative description of events arising out of motion of spatial entities is necessary. For example, if we consider the navigational domain as our specific problem domain and consider vehicles as our spatial entities, then
Figure 1. MBR of a spatial entity
motion events that we may like to describe in a qualitative manner may be one vehicle overtaking another on the left, one vehicle following another, etc. QSR is the right tool here as it is very close to commonsense reasoning. The challenge is to develop qualitative constraint calculi with the help of which one can describe the relationship between spatial entities in motion. Our framework can be used in connection with an orientation calculus to describe such events. The rest of the paper is organised as follows: in section 2, the concept of relative direction is introduced; in section 3, direction relations are defined; in section 4, composition and conceptual dependency of relations are explained; and finally in section 5, we conclude the paper with pointers to future work and further research.

II. RELATIVE DIRECTION
A. Representation of Spatial Entity
As mentioned earlier, the spatial entities considered are two dimensional. Each entity is represented by its Minimum Bounding Rectangle (MBR) as shown in figure 1. Representation of a two dimensional spatial entity by an MBR is reported in the literature. Orientation of extended spatial entities, represented by their MBRs, has been reported in the work of Goyal and Egenhofer [5] and Skiadopoulos and Koubarakis [6]. The sides of the MBR are parallel to the axes of an orthogonal basis in a two dimensional Euclidean space. The constraint of sides being parallel to the axes could introduce some loss of information vis-a-vis the actual direction of motion. But such loss is inherent in qualitative abstractions.

B. Intrinsic Direction of Motion
In order to represent the intrinsic direction of motion of a spatial entity, the MBR is divided into four tiles as shown in figure 2.
Figure 2. An MBR divided into tiles
The front of the entity is abstracted by a point, and the same is the case for the back. An intrinsic direction of motion of the entity can be specified by considering the tile in which the front lies and the tile in which the back lies. For example, if the front lies in tile number 1 and the back lies in tile number 4, then the entity is heading along the positive Y-axis direction. We assume that entities move in the direction of their front. But if the front is in tile 3 and the back is in tile 4, then the entity is heading along the positive X-axis direction. We need to formalise these ideas. The intrinsic direction of an entity is specified as (mF,nB) where m and n are tile numbers such that 1 ≤ m, n ≤ 4 and m ≠ n. The letter F means that the front is in tile m and the letter B means that the back is in tile n. For example, if the intrinsic orientation is (1F, 4B), the entity is directed along the positive Y-axis. We consider four major directions of motion. These are along the positive and negative directions of the axes of projection. These major directions are made finer by considering the cases where the entity may be inclined to the axis. In figure 3, we show these possibilities for an entity that is heading along the positive Y-axis. The entities A and B are inline with the positive Y-axis as indicated by their front and back. The entity C is directed along the positive Y-axis but is inclined to the left. The entity D is inclined to the right. It is easy to see that this inclination spans an angle of 90 degrees, i.e. the tilt to the left or to the right can be in any direction in a span of a 90 degree angle. Moreover, an entity cannot change its direction from positive Y-axis to, say, positive X-axis without being inclined to the right. This becomes important in defining the conceptual neighbourhood of relations later on. The cases where the front or the back lies on one of the line segments of the MBR can be treated as the other cases by defining an appropriate convention.
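A small sketch of this encoding follows. The tile layout used (1 = top-left, 2 = top-right, 3 = bottom-right, 4 = bottom-left) is an assumption read off the figures; only the (mF, nB) notation and the two worked examples come from the text above.

```python
# Tile centers in an x/y coordinate frame (layout is an assumption).
TILE_CENTER = {1: (-1, 1), 2: (1, 1), 3: (1, -1), 4: (-1, -1)}

def heading(front_tile, back_tile):
    """Return the qualitative heading implied by an (mF, nB) pair."""
    fx, fy = TILE_CENTER[front_tile]
    bx, by = TILE_CENTER[back_tile]
    dx, dy = fx - bx, fy - by          # vector from back to front
    if dx == 0:
        return "+Y" if dy > 0 else "-Y"   # inline with the Y axis
    if dy == 0:
        return "+X" if dx > 0 else "-X"   # inline with the X axis
    return "inclined"                     # tilted to the left or right

print(heading(1, 4))   # -> "+Y", the (1F, 4B) example from the text
print(heading(3, 4))   # -> "+X", front in tile 3, back in tile 4
```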
Figure 3. Inline with and inclined to positive Y-axis

Figure 4. 'Same' and 'RL' relation

Figure 5. 'LR' and 'Opposite' relation
C. Relative Direction of Motion
Here, two spatial entities are involved. We introduce four keywords, namely 'Same', 'Opposite', 'LeftToRight' and 'RightToLeft', for handling the relative direction of two spatial entities. 'LeftToRight' is abbreviated as LR and 'RightToLeft' is abbreviated as RL. 'Same' means that the two entities are moving in the same direction, 'Opposite' means they are moving in opposite directions, LR means the primary entity has a left to right direction with respect to the reference, and RL means that the primary has a right to left direction with respect to the reference. It is important to note that for relative direction, one needs to consider the intrinsic direction of each of the spatial entities involved. We represent a relative direction relation as a set of quadruples. Each quadruple is of the form (pF,qB,rF,sB), where p,q,r,s are tile numbers and the intrinsic direction of the primary is (pF,qB) and that of the reference is (rF,sB). In figures 4 and 5, a few cases of the relative direction relations are shown. A and B are both moving in the 'Same' direction, i.e. the positive Y-axis direction. Movement of D is from right to left with respect to C. Movement of Q is in a left to right direction and that of S is in the opposite direction. When we say that the primary is in, say, a left to right direction, the position of the primary is not important. It may be in any spatial orientation with respect to the reference. The consideration here is their direction of motion only.
Figure 6. 'SameLeft' and 'LeftSame' relation
Figure 7. 'LRRight' and 'LeftOpposite' relation
These major relations need to be refined by considering the cases where one or both of the spatial entities may be inclined. We use two keywords for handling this: the first one is for the primary and the second one is for the reference. The inclination is described either by 'Left' or by 'Right'. For example, let us consider the two relations 'LeftSame' and 'SameLeft'. In the first, the primary is tilted to the left and in the second, the reference is inclined to the left; in both, the entities are moving in the same direction. Again, in the relation 'LRLeft', the primary is in a left to right direction with respect to the reference and the reference is inclined to the left. In figures 6 and 7, we illustrate a few scenarios for relative direction relations. It is important to note that these figures do not cover all the possibilities of the relations depicted. In figure 6, A and B are moving in the same direction but the reference is inclined to the left, so the relation is 'SameLeft'. Since D is inclined to the left and it is the primary entity, the relation between D and C is 'LeftSame'. In figure 7, the primary A is in the 'LR' direction but the reference is tilted to the right, so the relation is A 'LRRight' B. D and C are in opposite directions with the primary tilted to the left, so the relative direction is 'LeftOpposite'.

III. DEFINITION OF DIRECTION RELATIONS

Below, we enumerate the relative direction relations along with an abbreviation for each relation.
1) Same (S): Both primary and reference move in the same direction
2) LeftSame (LS): Same direction, primary inclined to left
3) RightSame (RS): Same direction, primary inclined to right
4) SameLeft (SL): Same direction, reference inclined to left
5) SameRight (SR): Same direction, reference inclined to right
6) SameDiagonal (SD): Same direction, both inclined
7) Opposite (O): Both are moving in opposite directions
8) LeftOpposite (LO): Opposite direction, primary inclined to left
9) RightOpposite (RO): Opposite direction, primary inclined to right
10) OppositeLeft (OL): Opposite direction, reference inclined to left
11) OppositeRight (OR): Opposite direction, reference inclined to right
12) OppositeDiagonal (OD): Opposite direction, both inclined
13) LR: Left to Right direction
14) LeftLR (L-LR): Left to Right direction, primary inclined to left
15) RightLR (R-LR): Left to Right direction, primary inclined to right
16) LRLeft (LR-L): Left to Right direction, reference inclined to left
17) LRRight (LR-R): Left to Right direction, reference inclined to right
18) LRDiagonal (LRD): Left to Right direction, both inclined
19) RL: Right to Left direction
20) LeftRL (L-RL): Right to Left direction, primary inclined to left
21) RightRL (R-RL): Right to Left direction, primary inclined to right
22) RLLeft (RL-L): Right to Left direction, reference inclined to left
23) RLRight (RL-R): Right to Left direction, reference inclined to right
24) RLDiagonal (RLD): Right to Left direction, both inclined

We have introduced a notation where we represent the relative direction of two entities as a quadruple. Each of the above relations can be defined as a set of quadruples. Let us take the example of the 'LeftOpposite' relation. When we say that the primary is opposite in direction, the first thing we need to consider is the direction of the reference. If we assume that the reference is directed along the positive Y-axis, then the primary has to be directed along the negative Y-axis. If the reference is directed along the negative X-axis, then the primary has to be directed along the positive X-axis. For the first case, we will have two quadruples, (3F,1B,1F,4B) and (3F,1B,2F,3B). For the second case, the quadruples are (4F,2B,2F,1B) and (4F,2B,3F,4B). Taking into consideration the other two cases, the relation 'LeftOpposite' will be:
{(2F,4B,1F,2B), (2F,4B,4F,3B), (4F,2B,2F,1B), (4F,2B,3F,4B), (3F,1B,1F,4B), (3F,1B,2F,3B), (1F,3B,4F,1B), (1F,3B,3F,2B)}

In a similar way, other relations can be defined. The set of relative direction relations defined above is closed under composition and converse. 'Same' is the identity relation. Moreover, the relations are Jointly Exhaustive and Pairwise Disjoint (JEPD). They are JE because all the 16 x 16 = 256 possibilities are included in the definition of the relations. They are PD because the intersection of any two relations, defined as sets of quadruples, is the null set.
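Since each relation is just a set of quadruples, the set-theoretic claims above can be checked mechanically. The Python sketch below is illustrative: only 'LeftOpposite' is enumerated in the text, and the helper names are assumptions.

```python
# 'LeftOpposite' copied verbatim from the definition above.
LEFT_OPPOSITE = {
    ('2F', '4B', '1F', '2B'), ('2F', '4B', '4F', '3B'),
    ('4F', '2B', '2F', '1B'), ('4F', '2B', '3F', '4B'),
    ('3F', '1B', '1F', '4B'), ('3F', '1B', '2F', '3B'),
    ('1F', '3B', '4F', '1B'), ('1F', '3B', '3F', '2B'),
}

def converse(relation):
    """Swap primary and reference in every quadruple (pF,qB,rF,sB)."""
    return {(r, s, p, q) for (p, q, r, s) in relation}

def pairwise_disjoint(relations):
    """PD holds iff no quadruple appears in two different relations."""
    seen = set()
    for rel in relations:
        if rel & seen:
            return False
        seen |= rel
    return True

print(pairwise_disjoint([LEFT_OPPOSITE, converse(LEFT_OPPOSITE)]))
```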
IV. COMPOSITION AND CONCEPTUAL NEIGHBOURHOOD
A. Composition
Composition of binary relations is an important operation in qualitative spatial reasoning. In table I, we present the composition of all direction relations with the 'Same' and 'Opposite' relations. Some combinations are not possible, and that is the reason we have blank entries in the table.
TABLE I. COMPOSITION OF DIRECTION RELATIONS WITH 'SAME' AND 'OPPOSITE'
For example, take the composition of 'Same' with 'LeftOpposite'. Let x, y and z be three spatial entities such that x 'Same' y and y 'LeftOpposite' z. In this case, when the 'Same' relation holds, it implies that both x and y are in line with one of the axes. But the 'LeftOpposite' relation demands that y be inclined to the left with respect to some axis. The entity y cannot be in both configurations at once, so this composition is empty.

B. Conceptual Neighbourhood
The conceptual neighbours of a relation R are those relations that can hold between the spatial entities at the next time point. For example, assume that the 'Same' relation holds between two entities x and y at some point of time. Because of the movement of one or both of the entities, this relation may change at a subsequent time point. But after 'Same', the next relation that holds between x and y cannot be 'Opposite'. Let x be the primary and let y be the reference. The possibilities are: x becomes inclined to the left or to the right, or y becomes inclined to the left or to the right, or both become inclined. Accordingly, the possible relations that may hold after 'Same' are LeftSame, RightSame, SameLeft, SameRight or SameDiagonal. So, these relations are conceptual neighbours of 'Same'. The conceptual dependency is expressed in the form of a graph where each node represents a relation and an edge is drawn between two nodes that are conceptual neighbours. Conceptual neighbourhoods express a notion of spatio-temporal continuity. In figure 8, we show the conceptual neighbourhood of the 'OppositeDiagonal' relation.
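A conceptual neighbourhood graph of this kind has a direct representation as an adjacency map. The sketch below encodes only the neighbours of 'Same' spelled out above; the remaining entries and the helper name are illustrative assumptions.

```python
# Conceptual neighbourhood graph; only the 'Same' entry comes from
# the text, the other 23 relations would be filled in the same way.
NEIGHBOURS = {
    'Same': {'LeftSame', 'RightSame', 'SameLeft', 'SameRight', 'SameDiagonal'},
    # ... remaining relations ...
}

def possible_next(relation):
    """Relations that may hold at the next time point: the relation
    itself (nothing changed) or one of its conceptual neighbours."""
    return {relation} | NEIGHBOURS.get(relation, set())

print(possible_next('Same'))   # 'Opposite' can never follow 'Same' directly
```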
TABLE II. COMPOSITION OF DIRECTION RELATIONS WITH 'LR' AND 'RL' RELATIONS
V. CONCLUSION AND FUTURE WORK

In this paper, we have proposed a knowledge representation technique for describing the motion of two-dimensional spatial entities modeled by their MBRs. A set of JEPD relations is defined, and this set is closed under composition and converse with an identity relation 'Same'. The framework developed can be used in cognitive vision applications where one needs to describe motion events of spatial entities such as vehicles, human beings etc. in a qualitative way. It is important to note that this formalism can be used in conjunction with an orientation calculus for describing motion events in a qualitative way. For example, if we consider a simple
event of one vehicle following another, then it is clear that the vehicle that is being followed must be in front of the other. This relationship can be expressed with the help of a qualitative orientation calculus. Our formalism can be used to describe the fact that both vehicles are moving in the same direction. Future work includes development of a reasoning formalism for the framework, identification of tractable subclasses of the relations, exploration of the possibility of making the relations finer, etc.

REFERENCES
[1] A.G. Cohn and S.M. Hazarika, "Qualitative spatial representation and reasoning: an overview", Fundamenta Informaticae, 46 (1-2), pp. 1-29, 2001.
[2] J.F. Allen, "Maintaining knowledge about temporal intervals", Communications of the ACM, vol. 26(11), pp. 832-843, 1983.
[3] J. Renz, "A spatial odyssey of the interval algebra: 1. directed intervals", In Proceedings of the 17th International Joint Conference on Artificial Intelligence, pp. 51-56, 2001.
[4] R. Moratz, J. Renz and D. Wolter, "Qualitative spatial reasoning about line segments", In Proceedings of ECAI 2000, pp. 234-238, 2000.
[5] R.K. Goyal and M.J. Egenhofer, "Similarity of cardinal directions", Advances in Spatial and Temporal Databases, Lecture Notes in Computer Science, 2121, pp. 36-58, Jensen, C.S., Schneider, M., Seeger, B., Tsotras, V.J. (Eds.), Springer-Verlag, Berlin, Heidelberg, New York, 2001.
[6] S. Skiadopoulos and M. Koubarakis, "On the consistency of cardinal direction constraints", Artificial Intelligence, vol. 163(1), pp. 91-135, 2005.
Figure 8. Conceptual dependency of 'OppositeDiagonal'
Environment And Embeddedness

Gagan Deep Kaur
Department of Humanities & Social Sciences
IIT Bombay, India
[email protected]
Abstract — This paper seeks to examine the notion of embeddedness from the overlapping perspectives of AI and the social sciences. It is emphasized that the notion of embeddedness as conceived by the two approaches depends upon their different concepts of environment. Once the latter gets clear, the conceptual confusion surrounding embeddedness too can be cleared. Regarding embeddedness, whereas AI puts more emphasis on situatedness of the system, drawing inputs and giving output to complete a task, the social sciences' perspective sees embeddedness as a dynamic interaction with the environment which is context-specific (but not task-specific as that of artificial creatures) and admits a wide range of goals and behaviors from the agent's point of view. This interaction is carried out at various levels, of which physical interaction is just a part. A distinguishing feature of this interaction is that it is tinged with socio-cultural facts which make it more dynamic than the interaction of artificial systems. The paper then elucidates the differences between the two approaches with respect to the notion of environment and how embeddedness is conceived in both, and enquires if, and in what respects, artificial agents can be embedded.

Keywords- environment, embeddedness
Although the general tendency in Alife literature is to equate embeddedness with situatedness, this equation is problematic for two reasons: 1) It makes both concepts restrictive, for to arrive at an equation a one-to-one mapping is necessary (e.g. physical embodiment/situatedness here). 2) It overlooks the avenues from which the two can be profitably viewed and, therefore, improved upon (physical embodiment alone overlooks the social aspects involved in making sense of the agent's own body and world). The tendency has its roots in Brooks' now classical description of his situated robots, as noted by Dobbyn and Stuart: "The robots are situated in the world – they do not deal with abstract descriptions, but with here and now of the world directly influencing the behavior of the system" [2]. The problem with this view is that it takes into account only the "here and now" aspects of the interaction. This makes the interaction spatiotemporally constrained and thus presents a restrictive view of agency. It takes the environment to be a static entity admitting of no radical changes which could affect the future course of action. No doubt, the environment in virtual worlds is a carefully chosen pattern of different coordinates. Consequently, the here-and-now approach presents a restrictive view of embeddedness. On the other hand, human agency is not bound by spatiotemporal constraints. It presupposes the ability to transcend here-and-now fixations through imagination
and planning. Without this fundamental ability, agents are not termed 'agents' (in the sense of initiators of actions to whom reward/punishment can be attributed) at all; that is why animals and infants are not called agents. Because of this ability, a human agent is able to deal with the unpredictability of the environment. This ability is fundamental in humans and required for social interaction. Without the coping mechanisms necessary for this, like imagination, an individual's ability to carry on social interaction is hampered to a great degree. Neurological disorders like Autism and Asperger's Syndrome are characterized by this lack. The environment of biological cognitive systems, especially humans, is not static and pre-designed like virtual environments, or even carefully assembled like that of Animats. It is complex, ambiguous, unpredictable and admits multifaceted interpretation by the observer. This is due to the socio-cultural coloring of the human environment, which consists, apart from physical objects, of socio-cultural phenomena like customs, traditions, governance, history, law, folk-mythology, education and, of course, other human agents. The varying nature of these phenomena from place to place and time to time makes the environment ambiguous and unpredictable. Even routine transactions become highly context-dependent and demand background features to be considered while dealing with them or interpreting them. Without this socio-cultural backdrop, human embeddedness becomes superficial and mechanistic. This is because, more than dealing with the environment at the physical level, most human transactions are done within a particular socio-cultural setting. Whereas some actions are uniform across the globe owing to their being at the raw physical level, many actions are socio-culturally controlled and can be interpreted within that particular context only. Gravity-controlled movements like walking upright and universally valid psychological reactions like trying to avoid pain and seek pleasure are uniform across the globe; whereas bowing or folding hands in greeting are socio-culturally shaped and can be interpreted in a particular setting only (in this example, Chinese or Indian, respectively). (I am indebted to Prof. P. R. Bhat, IIT Bombay, for this suggestion.) Contexts add meaning to actions and make embeddedness enriched. Human beings' environment is causally linked to them at two levels – ontological and cognitive. Ontologically, it provides a backdrop without which human beings viewed as agents cannot exist [4]. Human agency cannot function in isolation. Action presupposes actors/objects to which the action is oriented. Self presupposes agency and agency is always embedded. This makes human agents socially
embedded agents. Without this backdrop, human selfhood is reduced to a disembodied existence, which is the hallmark of some of the transcendent traditions of oriental mysticism and of recent advents in cyberspace. Self, the characterizing feature of human systems, is necessarily an embedded agent. All human transactions are possible within this socio-culturally tinged environment only. The environment aids in the neat location of the Self and provides a point of reference from which the world can be apprehended. Without the egocentric space that it provides, the Self cannot know even itself [6]. At the cognitive level, apart from providing this backdrop, the environment is causally linked to human cognitive processes. Human cognition is deeply shaped and influenced by the environment it is embedded in. Many higher cognitive processes like thinking, remembering and perception are causally linked to sources outside the skin-skull boundary of the agent, so much so that many of the processes literally leak out into the world [3]. The environment, with the affordances that it provides, thus extends the cognitive abilities of human beings. For example, when a blind man probes the space with his stick, his haptic perception is extended. The stick is not just a temporarily affixed tool to the blind man's hand; rather, it is a part of his cognitive system. However, not every lay thing in the environment is part of the cognitive system. To sift out the relevant from the irrelevant aspects of the environment required for cognition, Naoya Hirose makes an important distinction between tools-before-use and tools-in-use: "Before use, the tool is a detached object of the environment, separated from the user's body. Thus, the tool is treated as functional extension of the environment; it has its specific affordances and provides new opportunities for action. In contrast, a tool in use is not a mere object. The tool is treated as functional extension of the user; it plays a central role in extending the user's effectivities to realize affordances of the environment" [7]. Thus, whereas the former can be seen as any other object in the environment, the latter, because of its value for cognitive enhancement, becomes part of the cognitive system itself. On the other hand, the environment of artificial systems appears more or less a backdrop against which a creature does its job and fulfils the task at hand. Even if it receives input from the environment, these inputs, as Dobbyn and Stuart note, are of a very limited kind. That is, only task-relevant inputs are recognized, received and reacted to; other inputs are ignored, which makes the overall behavior task-specific. The interaction achieved is thus at best reactive only. Agents sense their environment, apprehend certain stimuli according to the task they need to achieve, and react accordingly; but human systems, as we have seen, admit of a range of probable behaviors that can be applied even in a stereotypical situation. Whereas an artificial agent may apply one or two different ways of picking up soda cans depending upon the algorithm, human beings can perform a task in more than a dozen ways. Human beings, looking at their goals, may apply a different number of tasks to get the desired result, or even change their goal, depending upon the situation. Human transactions are thus more goal-specific than task-specific.
Artificial agents' embeddedness is thus entirely different from human beings' embeddedness and restricted to task-relevant interaction. Task-relevancy is not the problem; the problem is that this 'relevancy' is not decided by the system itself. Which aspects of the environment are to be considered 'relevant' is entirely the choice of the human designer. The background 'noise' considered redundant by the system is decided so by the designer at the back, not by the system itself. The point is that noise is noise for the designer, not the system. In human beings, which things from the backdrop become relevant depends upon the situation and the agent's take on it. The agent enjoys the freedom to pick up any aspect from the background and give it a meaning. Insensitivity to the background environment restricts such systems to task-specific interaction only. Once this notion of environment is clarified, the problem comes to the embeddedness of artificial systems. One question before proceeding should be: artificial systems are to be embedded in what? The choices here could be, first, embedding the artificial systems in their own environment, i.e. one as artificial as them, like controlled laboratories or simulated environments, and secondly, embedding the artificial systems in our environment: the real, physical environment of human beings tinted with socio-cultural facets. Confusing the former with our environment might pose a problem for the notion of embeddedness itself. One such confusion arises from Russell & Norvig's interpretation: "The so called situated movement aims to understand the workings of agents embedded in real environments with continuous sensory inputs. One of the most important environments for intelligent agents is the Internet" [9]. Going by the conditions necessary for embeddedness laid down by Dobbyn & Stuart and even Stuart [10], whereby physical situatedness, among others, is the foremost criterion for a system's embeddedness, Russell & Norvig's interpretation raises a paradox, as it explicitly makes the cyber-environment "one of the most important environments for intelligent agents." The paradox is that if physical situatedness is as necessary as laid down by various theorists including the above, then virtual agents cannot be embedded, for they lack physical embodiment; and if they are embedded, as we saw in this definition, then physical embodiment is after all not that necessary! The distinction proposed above could help solve this paradox by providing a clearer understanding of the term embeddedness. In narrow terms, embeddedness can be defined as the system's ability to interact with its environment (whether artificial, controlled or real-unpredictable), receive inputs from it and give feedback. On this narrow interpretation, virtual agents are embedded because they receive inputs from their environment and act (or react) accordingly (the AI perspective). The wider interpretation, however, involves interaction on a much broader scale. As we saw earlier, human transactions are socio-culturally tinged; the analysis of human interaction with its environment cannot be completed without taking these facts into consideration (the social sciences perspective). Thus, on the narrow interpretation, virtual agents would come out to be embedded, but on the wider interpretation, owing to their impoverished environment and impoverished embeddedness, they would not.
The second horn of this dilemma, regarding the non-significance of physical embodiment, can be resolved by adopting a phenomenological stance towards the problem of embodiment. Embeddedness, especially human embeddedness, starts with physical embodiment. Experiential existence presupposes the means to acquire that existence; physical embodiment is thus presupposed by embeddedness. An organism can be embedded in its environment only via its physical body. Merleau-Ponty's insight here could be helpful: "my body is the pivot of the world… I know that objects have several facets because I could make a tour of inspection of them, and in that sense, I am conscious of the world through the medium of my body" [8]. Thus, the physical body on the one hand provides a means of self-access (through proprioception), and on the other provides a means to know the world as well. This was robustly advocated by Hubert Dreyfus later [5]. Coming back to the choices available for embedded-in-what, almost everyone would agree that the aim is the second. That is, artificial systems need to be embedded in our environment, not just theirs. On that requirement, then, the need for physical embodiment cannot be ruled out. Body-neutrality can be maintained in virtual environments, but not in human environments. From the above discussion, some features of human embeddedness that come to mind are: 1) Interaction at more than one level, viz. social and cultural, of which the physical is just a part. 2) Goal-specific, but not task-specific, behaviors. Whereas tasks are short-term, deadline-bound transactions, goals are more temporally extended; some may range from learning to ride a cycle to maintaining survival in the long run. Whereas the former are held explicitly, the latter are held unconsciously. Goal-specificity thus gives purpose to the diachronic continuity of the Self. Task-specificity makes what Bratman calls "time-slice agents", called to perform some task and then, as it were, switched off, like some software agents; but human agents are "temporally extended" and goals give direction to them [1]. Between the completion of one task and the beginning of the next, a time-slice agent dies, as it were; but such is not the case with goal-driven persisting agents like humans, since every goal is embedded in a much larger network of overlapping goals, past and future, and continuity is maintained. This spatiotemporality was made an inseparable backdrop of human experience 250 years ago by Immanuel Kant and reasserted by phenomenologists like Husserl, Heidegger and Merleau-Ponty in the 20th century. 3) Context-sensitivity, admitting changes in goals depending upon the situation. Once spatiotemporal fluidity
of norms governing human actions is admitted, context becomes significant in initiating and interpreting actions and even in altering prior goals radically. An agent can have this context-sensitivity only if it is able to detect subtle changes in the environment, interpret those changes according to specific norms, and anticipate the outcome of a probable course of action. This, in turn, aids in further modification of goals. Unlike Dobbyn & Stuart and Stuart (2000), however, physical embodiment has been excluded from this list, as it is assumed to be the starting point for embeddedness already; instead, context-sensitivity is included. In short, an agent's embeddedness (be it biological or artificial), if it is not to be constrained to here-and-now fixations, would have to transcend spatiotemporal limitations by a think-before-act strategy which is part and parcel of human agency. This embeddedness needs to emerge dynamically from the background environment by being causally linked to it. Multi-level interaction, goal-specific behaviors and context sensitivity may ensure this linkage. This is necessary also because the human environment is complex, rich and ambiguous as compared to artificially controlled environments. If an artificial agent is to be embedded in this environment, then these requirements need to be fulfilled. Thus, unless these conditions are satisfied, it seems unlikely that artificial agents could be embedded in our environment in the way we are.

REFERENCES
[1] Bratman, Michael, "Reflection, Planning and Temporally Extended Agency", The Philosophical Review, vol. 109, no. 1, p. 40, 2000.
[2] Brooks, Rodney, "Intelligence Without Reason", Proc. International Joint Conference on Artificial Intelligence (IJCAI 91), Vol. 1, Morgan Kaufmann, Aug. 1991, p. 571.
[3] Clark, Andy and D. Chalmers, "The Extended Mind", Analysis, vol. 58, pp. 10-53, 1998.
[4] Dobbyn, Chris and Stuart, Susan, "Self as Embedded Agent", Minds and Machines, vol. 13, pp. 187-200, 2003.
[5] Dreyfus, Hubert, What Computers Can't Do?: The Limitations of Artificial Intelligence. Harper Colophon, 1979.
[6] Gallagher, Shaun and A. J. Marcel, "Self in Contextualized Action", Journal of Consciousness Studies, vol. 6, no. 4, pp. 4-30, 1999.
[7] Hirose, Naoya, "An Ecological Approach to Embodiment and Cognition", Cognitive Systems Research, vol. 3, pp. 290-91, 2002.
[8] Merleau-Ponty, Maurice, Phenomenology of Perception. Routledge & Kegan Paul, London, p. 82, 1970.
[9] Russell, Stuart, and P. Norvig, Artificial Intelligence: A Modern Approach, Prentice Hall, NJ, p. 27, 2003.
[10] Stuart, Susan, "A Radical Notion of Embeddedness: A Logically Necessary Precondition For Agency And Self-Awareness", Metaphilosophy, vol. 33, no. 2, pp. 98-109, 2002.
Artificial Intelligence in Gaming

Jeetendra Nihalani, Gokul Chandrasekaran, Rohan Narang, Mahsheed Eshraghi, and Shalini Bhatia
Computer Engineering Department, Thadomal Shahani Engineering College, Mumbai, India
[email protected], [email protected], [email protected], [email protected], [email protected]
Abstract—The game Theseus and Minotaur requires a player to help maneuver Theseus through a maze to an exit without being captured by Minotaur. For every step Theseus takes, Minotaur takes two using a fixed algorithm. It is time consuming and requires a lot of previous experience for a human to generate solvable mazes of an adequate difficulty level. This paper explains the modifications made to an already existing technique to generate these mazes using genetic algorithms. The technique is extended to help create mazes for a modified version of the game that includes an extra element called moving blocks.

Keywords-artificial intelligence; automation; genetic algorithms; Theseus and Minotaur

I. INTRODUCTION
Having existed for centuries, games have evolved with the people that play them. In this age of computers, remarkably intelligent game-play exists. The computer can now predict what move you will make next and counter that move. Techniques of artificial intelligence (AI) have influenced many aspects of game play, from the intelligence of the opponent to the creation of different levels of difficulty for users to enjoy. There are a variety of techniques that can be used to incorporate AI in games. One of the most elementary forms of AI uses uninformed search to come to a solution. These can broadly be categorized into breadth first search and depth first search. These algorithms use knowledge which is limited to the definition of the problem [1]. As an example, the minimax algorithm uses a depth first search to find the next best move for the computer. For the breadth first search technique, memory requirements are a bigger problem than execution time. The time required to perform uninformed search increases exponentially with the size of the problem. Informed search goes one step further to use problem-specific knowledge beyond the definition of the problem. It uses a heuristic function to find solutions more efficiently than an uninformed strategy. Informed search requires the computer to keep many nodes on the frontier in memory, and requires the programmer to choose a good heuristic function, which is not easy to formulate for many problems [2]. Rule-based systems are highly inefficient. In order to use just one rule, the system must examine all the rules. Despite the best intentions, hidden dependencies between rules may exist. This makes it hard to predict the effects of adding or deleting a rule [3]. Neural networks are also being used in gaming. The drawbacks of neural networks are that it is difficult to
understand and, at times, it is difficult to predict what the output of a neural network will be. The individual relations between the input and the output variables are not developed by engineering judgment. Hence, the model tends to be a black box or input/output table without an analytical basis [4]. Yet another technique to simulate intelligence in gaming is the use of genetic algorithms. This paper illustrates how it can be used in developing mazes for the puzzle game Theseus and Minotaur.
II. THE PROBLEM
A. The Game
Theseus and Minotaur is a puzzle that has been tremendously popular across the internet [5]. It is based on the Greek myth of a beast, half man and half bull, called Minotaur, who lives in a cave. The story goes that in the entire land there existed only one youth brave enough to tackle Minotaur. This youth, named Theseus, valiantly went into the cave and killed Minotaur. In the computer version of the game, Theseus is represented as a black dot in a maze and Minotaur as a red one. In this game, invented by Robert Abbott, the human player is given control of Theseus. The player can move up, down, left or right unless the direction he wishes to move in is obstructed by a wall. For each move that is made by the player, Minotaur, the computer, makes two. In order to compute each of the two moves Minotaur must make, the computer follows the same procedure. It first checks if Minotaur will get closer to Theseus if it moves horizontally. If this is true and there is no wall obstructing Minotaur, the computer plays this move. Otherwise, the computer checks if Minotaur will be closer to Theseus if it moves vertically. If this is true and there is no wall obstructing Minotaur, the computer plays the move. Otherwise, the computer is forced to remain in its position. Armed with the knowledge of how Minotaur will move, a player is expected to maneuver himself to a designated exit without being caught by Minotaur. The algorithm that Minotaur follows allows the player to trap it, in strategic locations on the map, behind walls.
B. Maze Generation
The creation of mazes which require a player to think more, and hence are difficult to play, requires a lot of time and clever thinking on the part of a human designer. It is desirable to have this task automated so as to produce a variety of difficult mazes in a relatively short period of time. A brute force technique to generate mazes is
unsuitable since the search space, all possible mazes of a given size, is too large. Various techniques have been used before to automate maze generation. Toby Nelson has designed a program which generates mazes randomly, then refines the best ones by testing various mutations. The mazes are ranked according to measures like solution length and the number of false paths. In his Java implementation of the game, Nelson added fourteen of his own mazes. This new set of mazes was generally easier than Abbott's original, though each still required several small leaps of logic. Nelson's first thirteen mazes provided a steady ramp of difficulty for solvers before they reached Abbott's original. And Nelson added a final maze. This maze, called The Dread Maze Fifteen, was far more complex than Abbott's. A similar program, designed by James W. Stephens, provides the basis for the many wonderful puzzles at PuzzleBeast [5]. Search techniques that move towards a maximum help reduce the number of possibilities that are required to be checked. Suchao Chaisilprungrueng and Hadi Moradi, in the paper titled 'Designing a Theseus-Minotaur game using Genetic Algorithms' [6], have explained a technique used to automate maze generation. The technique of automated maze generation explained in this paper differs in the way the fitness function and the process of selection of mazes are implemented.
III. GENETIC ALGORITHMS
Part of a larger group of algorithms called evolutionary algorithms, the genetic algorithm is a search technique which mimics evolution. Given a problem, each possible solution to it is represented in the form of an individual in a population. Each individual in a population is given a fitness value based on its proximity to the required result. Individuals of this population are then used to produce the next generation of solutions. The next generation is created by selecting individuals with a probability directly proportional to their fitness, crossing them over and mutating the resultant individuals. With each passing generation the new set of solutions formed tends towards a good solution to the problem. The genetic algorithm thus finds a solution to a problem without checking every possible solution and hence reduces the number of states that are required to be searched.
A. Individual
In an implementation of a genetic algorithm, an individual is a possible solution to the problem statement. There are a variety of possible encodings of a solution. Traditionally, solutions are represented as strings of 0's and 1's, but other encodings are also possible [7]. In this project, an individual is one possible configuration of the maze. The starting position of Theseus, the starting position of Minotaur, the exit position, the configuration of horizontal walls and the configuration of vertical walls are encoded in a string of 0's and 1's. This string represents an individual. The length of an individual for a maze of a given size is fixed. The total number of cells in a maze is given by:
Total Cells = number of rows * number of columns
The cells which are on the boundary of the maze consist of a wall or walls, based on which boundary they are located on (for example, a north bounding cell would have a north wall by default). These walls are not accounted for in the binary string since they must exist by default in every maze. Suppose there are 'r' rows and 'c' columns in a maze. The number of vertical walls in a single row, excluding the walls at the boundary, is given by (c-1). Therefore the total number of vertical walls is given by r*(c-1). Similarly, the total number of horizontal walls can be calculated. Hence, the number of bits required to specify the horizontal and vertical wall locations can be calculated as follows:
No. of bits for horizontal walls = number of columns * (number of rows - 1)
No. of bits for vertical walls = (number of columns - 1) * number of rows
The number of bits needed to specify the locations of Theseus, Minotaur, and the exit are calculated based on the total number of cells in the maze. For example, if the total number of cells in a maze is 8, then each of these positions can be specified by 3 bits. In case there are 6 cells in total, a minimum of 3 bits is still needed to reference each of them distinctly; the strings '110' and '111' are simply left unused, as the required number of cells has already been referenced. The ceiling of the base-2 logarithm of the total number of cells gives the number of bits needed to represent any valid maze location:
Position length = ceil( log2 (total number of cells) )
Therefore, the total number of bits in the binary string is given by:
Total No. of bits = 3 * (Position length) + No. of bits for horizontal walls + No. of bits for vertical walls
B. Population
Genetic algorithms are implemented by generating a random population of solutions to the problem and then evolving these solutions into better ones. Evolution is carried out till the desired solution is obtained or till a desired level of accuracy is achieved. In each generation, the fitness of every individual in the population is computed. Based on their fitness values, multiple individuals from the current generation are selected, modified and placed in the next population. In this project, the initial population is a set of randomly created mazes. Parameters called probability of horizontal wall generation and probability of vertical wall generation are used to create the horizontal and vertical walls, represented by 0's and 1's, of individual mazes in the initial population. The nature of the game dictates that horizontal traps – horizontal walls that allow Theseus to trap Minotaur – are easier to perceive, while vertical traps – vertical walls that allow Theseus to trap Minotaur – are more difficult to identify. Hence, by starting with a population of solutions having a large number of vertical walls, it is intended to generate final solutions with the more difficult vertical traps.
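To make the encoding and the initial population concrete, the following sketch computes the string length from the formulas above and draws a random individual using the two wall-generation probabilities. The function names are illustrative, not taken from the paper's implementation.

```python
import math
import random

def encoding_lengths(rows, cols):
    """Bit counts for the binary string encoding one maze individual."""
    # ceil(log2(total cells)) bits each for Theseus, Minotaur and the exit.
    position_bits = 3 * math.ceil(math.log2(rows * cols))
    horizontal_wall_bits = cols * (rows - 1)   # interior walls only
    vertical_wall_bits = (cols - 1) * rows
    return position_bits, horizontal_wall_bits, vertical_wall_bits

def random_individual(rows, cols, p_h, p_v):
    """Random maze string for the initial population."""
    pos_bits, h_bits, v_bits = encoding_lengths(rows, cols)
    # Note: randomly drawn positions may reference invalid cells (e.g. 36-63
    # for a 6x6 maze); such individuals must be discarded or regenerated.
    positions = ''.join(random.choice('01') for _ in range(pos_bits))
    h_walls = ''.join('1' if random.random() < p_h else '0' for _ in range(h_bits))
    v_walls = ''.join('1' if random.random() < p_v else '0' for _ in range(v_bits))
    return positions + h_walls + v_walls

# A 6x6 maze needs 3*6 + 6*5 + 5*6 = 78 bits in total.
maze = random_individual(6, 6, p_h=0.25, p_v=0.15)
assert len(maze) == 78
```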
C. Fitness Function
The most important aspect to consider while implementing genetic algorithms is the fitness function. Fitness represents the quality of a solution. The fitness function is problem dependent and should be designed keeping in mind the various factors that influence the solution of a problem. To calculate the fitness of a maze, the following parameters are considered:
-- The number of moves Theseus makes (a1): A difficult maze requires Theseus to make more moves to reach the exit, while a smaller number of moves signifies a simpler maze.
-- The number of moves Minotaur makes (a2): If Minotaur is stuck at a position for an extended period of time, then it is not very difficult for Theseus to reach the exit. Hence, the fitness of a maze is directly proportional to this value. When a maze is generated such that Minotaur is trapped by walls on all four sides, this parameter is 0 and hence the fitness of the maze is low. This prevents such mazes from propagating to the next generation.
-- The number of moves Minotaur is static (a3): This parameter gives a simple count of the number of moves that Minotaur is trapped for.
-- The number of times Minotaur is trapped (a4): This parameter introduces an element of strategic planning on the part of Theseus as to where Minotaur must be trapped.
-- The number of false paths (a5): A false path is a series of steps which, if made, does not result in a solution. With a large number of false paths, a larger look-ahead is required to determine if a solution using a particular approach is possible.
Based on the parameters stated above, the fitness of a maze is given by:
Fitness = k1 * a1 + k2 * a2 + k3 * a3 + k4 * a4 + k5 * a5
where k1, ..., k5 are proportionality constants, each the product of a corresponding scaling factor and the importance of the given parameter. Thus, kx = importance (impx) * scaling factor (sx), where x = 1, 2, 3, 4, 5. A scaling factor is used in order to normalize the values of all the parameters. Consider a fitness function comprising parameters a1 and a2, a maze, maze1, with a1 = 40 and a2 = 6, and another maze, maze2, with a1 = 30 and a2 = 9. If we consider only parameter a1, maze1 is better by a factor of 1.3333 (i.e. 40/30). Similarly, if we consider only a2, maze2 is better by a factor of 1.5 (i.e. 9/6). Thus, if both these factors are equally important, maze2 is the better maze. It would be incorrect to say that maze1 has a higher fitness because its raw sum is 40 + 6 = 46, which is greater than 30 + 9 = 39. This ambiguity is addressed by normalizing each parameter by dividing it by the maximum value of the parameter found amongst all the individuals in the population. The normalized parameters can then be multiplied by another constant, called importance, to determine the fitness value of the individual. It is important to note that this fitness formula calculates the fitness of an individual in a given population.
This value is useful only to select a given individual from the current population. If the fitness of the individual must be calculated when it is put in another population, the normalization of parameters must be done with respect to the maximum value of each parameter present in the new population.
D. Crossover
After calculating the fitness of each individual in a population, the next step is to generate the next generation of individuals. Crossover is performed between the binary string representations of two mazes. A random position is selected, from which the right substrings of both mazes are interchanged. If any of the resultant mazes is not a valid maze, the previous step is performed again. Crossover between individuals is carried out by selecting two individuals for each crossover. Those individuals with a higher fitness have a greater chance of being selected to be crossed over. This is based on the principle of the survival of the fittest seen in nature.
E. Mutation
Each new individual created by crossing two parents may or may not be mutated, based on the probability of mutation. Five random bits in the binary representation are changed to produce a mutated individual, which is then checked to see if it represents a legal maze. If the maze is legal, the mutation stops. An example will demonstrate the meaning of a legal maze. Let the maze have sides of length six. Therefore, in order to represent each position on the maze we need 6 bits. Out of the 2^6 = 64 possible combinations, only 36 are valid. If, after mutation, a maze results whose string contains a binary representation of any position other than 0 to 35, it must be discarded. Such a maze is illegal.
F. Elitism
The two fittest mazes in a generation are transferred unchanged to the next generation. This ensures that the fitness of the best maze in the next generation is at least as high as in its previous generation.
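One generation under these operators can be sketched as follows. This is a simplified outline, not the paper's code: selection is roulette-wheel, one child is produced per crossover, invalid offspring are simply discarded and retried, and `is_legal` stands in for the maze-validity check described above.

```python
import random

def select(population, fitnesses):
    # Fitness-proportionate (roulette-wheel) selection.
    return random.choices(population, weights=fitnesses, k=1)[0]

def crossover(a, b):
    # Single-point crossover: take the right substring of b at a random cut.
    point = random.randrange(1, len(a))
    return a[:point] + b[point:]

def mutate(individual, n_bits=5):
    # Flip five random bits, as described in Section III-E.
    bits = list(individual)
    for i in random.sample(range(len(bits)), n_bits):
        bits[i] = '1' if bits[i] == '0' else '0'
    return ''.join(bits)

def next_generation(population, fitnesses, p_mutation, is_legal):
    # Elitism: carry the two fittest mazes over unchanged.
    ranked = [p for _, p in sorted(zip(fitnesses, population), reverse=True)]
    new_pop = ranked[:2]
    while len(new_pop) < len(population):
        child = crossover(select(population, fitnesses),
                          select(population, fitnesses))
        if random.random() < p_mutation:
            child = mutate(child)
        if is_legal(child):            # discard illegal mazes and retry
            new_pop.append(child)
    return new_pop
```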
IV. MAZE SOLVER
In order to check if a maze is solvable, and to compute the values of the parameters used in the fitness function, a maze solver is necessary. This module uses a version of breadth first search to find a solution to a maze. The first solution found using breadth first search is the shortest solution to the maze. One of the parameters that the fitness function comprises is the number of false paths – paths which lead to Minotaur capturing Theseus. In order to find the number of false paths, all possible paths that the user can take must be considered. Since all possible combinations must be considered to count the false paths, the A* algorithm is not applicable, and breadth first search is used to find a solution. All possible moves that can be made at each cell position are considered. However, to avoid visiting previously considered states, the program keeps track of the cell locations that Theseus and Minotaur have visited using the visited matrix.
Initially a queue contains the starting maze state. This is popped and the five possible moves (left, right, down, up, and no move) that Theseus can make are considered. The valid moves are put into the queue. The elements of the queue are then popped and evaluated in a similar manner. The algorithm terminates when the queue that stores the states that must be visited becomes empty. At termination, either a solution is found and all false paths are exhausted, or a solution is not found. Mazes for which solutions are not found are declared unsolvable. Since visited states are considered only once, after which they are never put into the queue, the time complexity of the maze solver is proportional to the maximum number of possible states of the maze. A state is represented by the locations of Theseus and Minotaur. Let 'n' be the number of cells in the maze, which is equal to rows * columns. Theseus can take 'n' distinct positions and Minotaur can take 'n' distinct positions. Therefore, there are n * n distinct states that are evaluated in the worst case, assuming that all cell positions are reachable. Thus, the time complexity of the maze solver is O(n^2). Since all states must be evaluated to find the number of false paths, the time complexity is always O(n^2). As an example, consider a maze of 10 rows and 10 columns. It has a total of 100 cell positions, or n = 100. Each time the maze solver is invoked for a maze with n = 100, there are 10,000 distinct states of the maze that will be checked. Using the parameters found after an exhaustive breadth first search, the fitness of the maze is calculated.
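In outline, the exhaustive search looks like the sketch below. The game mechanics (`theseus_moves`, `minotaur_response`, the goal and capture tests) are placeholders for the rules described in Section II, not code from the project.

```python
from collections import deque

def solve(start_state, theseus_moves, minotaur_response, is_exit, is_capture):
    """Exhaustive BFS over (Theseus, Minotaur) states.

    Returns the length of the shortest solution (None if unsolvable)
    and the number of false paths found along the way.
    """
    queue = deque([(start_state, 0)])
    visited = {start_state}            # the 'visited matrix' from the text
    shortest, false_paths = None, 0
    while queue:
        state, depth = queue.popleft()
        for move in theseus_moves(state):   # left, right, down, up, no move
            nxt = minotaur_response(move)   # Minotaur replies with two moves
            if is_capture(nxt):
                false_paths += 1            # a branch ending in capture
            elif is_exit(nxt):
                if shortest is None:
                    shortest = depth + 1    # BFS: first solution is shortest
            elif nxt not in visited:
                visited.add(nxt)
                queue.append((nxt, depth + 1))
    return shortest, false_paths
```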
V. RESULTS
Mazes generated using the following parameters are presented below:
Horizontal Length = 6
Vertical Length = 6
Constant Population Size = 100
Probability of Horizontal Wall Generation = 0.25
Probability of Vertical Wall Generation = 0.15
Elitism Implemented
Fitness = a1*k1 + a2*k2 + a3*k3 + a4*k4 + a5*k5
a1 = number of player moves; k1 = 2 * 100 / max. value of number of player moves in population
a2 = number of enemy moves; k2 = 3 * 100 / max. value of number of enemy moves in population
a3 = number of static enemy moves; k3 = 1 * 100 / max. value of number of static enemy moves in population
a4 = number of blocks faced by enemy; k4 = 1.5 * 100 / max. value of number of blocks faced by enemy in population
a5 = number of false paths; k5 = 1 * 100 / max. value of number of false paths in population
Table I presents the values of the parameters that determine the fitness of the fittest maze of a given generation, along with its fitness value. The fitness values presented in Table I were generated by placing the fittest maze of each generation in a separate population. After reaching the last generation, the fitness of the mazes in this population was calculated using the maximum values of each parameter in this population as the basis for normalization. Figs. 1-6 show the mazes with the highest fitness values in their corresponding generations. It can be seen that the walls gradually change from generation 0 to generation 1760. The fittest maze of generation 1760 is the most evolved. The improvement of the fitness of the fittest maze can be seen in Fig. 7, which graphically represents the fitness of the maze versus the generation number.
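With the importance values above (2, 3, 1, 1.5 and 1), the per-population normalization reduces to a few lines. This sketch uses illustrative names and assumes each individual's raw parameters a1..a5 have already been measured by the maze solver.

```python
IMPORTANCE = {"a1": 2, "a2": 3, "a3": 1, "a4": 1.5, "a5": 1}

def fitness_scores(population_params):
    """population_params: one dict of raw a1..a5 values per individual."""
    # kx = importance * 100 / max value of that parameter in the population.
    max_vals = {k: (max(p[k] for p in population_params) or 1)
                for k in IMPORTANCE}
    return [sum(IMPORTANCE[k] * 100 * p[k] / max_vals[k] for k in IMPORTANCE)
            for p in population_params]
```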
VI. FURTHER WORK
This project has been modified to generate mazes for a modified version of Theseus and Minotaur. This version includes the concept of moving blocks, which are blocks placed in different positions on the maze. With each move of Theseus or Minotaur, the blocks move along a predefined path. A block in a particular position prevents Theseus or Minotaur from moving to that position. Additionally, if Theseus or Minotaur moves to a particular position and in the next move a block moves to that position, the corresponding player is killed. The movement of the blocks is periodic along fixed paths, and therefore a player can look ahead and determine the moves needed to get to the exit. This implementation still requires testing.
TABLE I
Generation | Fitness of Fittest Maze | Player Moves | Normalized Number of Enemy Moves | Normalized Number of Times Enemy must be Blocked | Normalized Number of Static Enemy Moves | Normalized Number of False Paths
0    | 177.5 | 19 | 27  | 14 | 11  | 28
320  | 753.5 | 89 | 92  | 86 | 77  | 98
640  | 785   | 89 | 97  | 84 | 88  | 100
960  | 784.5 | 92 | 97  | 89 | 83  | 96
1280 | 793   | 96 | 97  | 97 | 94  | 72
1760 | 816   | 98 | 100 | 98 | 100 | 72
Figure 1. Fittest Individual in Generation 0.
Figure 2. Fittest Individual in Generation 320.
Figure 3. Fittest Individual in Generation 640.
Figure 4. Fittest Individual in Generation 960.
Figure 5. Fittest Individual in Generation 1280.
Figure 6. Fittest Individual in Generation 1760.
Figure 7. Graph showing the fitness of the fittest maze v/s the generation number.
ACKNOWLEDGMENT
Jeetendra C. Nihalani, Gokul Chandrasekaran, Rohan Narang and Mahsheed Z. Eshraghi thank Ms. Shalini Bhatia for having sincerely guided them during the implementation of this project. They also thank Mr. Robert Abbott, inventor of the game Theseus and Minotaur, for allowing the use of his game in the project and for having provided invaluable inputs related to the difficulty of the mazes generated.
REFERENCES
[1] Russell, Stuart and Norvig, Peter, Artificial Intelligence: A Modern Approach. s.l.: Dorling Kindersley, 2009, p. 122.
[2] Danyluk, Andrea Pohoreckyj, "CS 108 - Lecture 32," Computer Science at Williams College Web site. [Online]. [Cited: October 21, 2009.] http://www.cs.williams.edu/~andrea/cs108/Lectures/InfSearch/infSearch.html.
[3] "CS 2360," Computer Science at Georgia Tech Web site. [Online]. (May 30, 1996.) [Cited: October 22, 2009.] http://www.cc.gatech.edu/computing/classes/cs2360/spring96/lec20.html.
[4] Vogt, Andrew and Bared, Joe G., "Accident Models for Two-Lane Rural Roads: Segments and Intersections," US Department of Transportation - Federal Highway Administration Web site. [Online]. (1998.) [Cited: October 22, 2009.] http://www.tfhrc.gov/safety/98133/ch02/body_ch02_05.html.
[5] Delgado, Tony "Tablesaw", "COLUMN: 'Beyond Tetris' - Theseus and the Minotaur / Mummy Maze," Game Set Watch. [Online]. (February 12, 2007.) [Cited: March 6, 2010.] http://www.gamesetwatch.com/2007/02/column_beyond_tetris_theseus_a_1.php.
[6] Chaisilprungrueng, Suchao and Moradi, Hadi, "Computer Science: University of Southern California," University of Southern California Web Site. [Online]. [Cited: July 1, 2009.] http://www.cs.usc.edu/Research/techreports/papers/03-800.pdf.
[7] "Crossover (Genetic Algorithm)," Wikipedia. [Online]. [Cited: October 8, 2009.] http://en.wikipedia.org/wiki/Crossover_(genetic_algorithm).
Role of technology in assisting children with Developmental Disorders
Harini Sampath, Jayanthi Sivaswamy, Bipin Indurkhya
International Institute of Information Technology, Hyderabad, India
[email protected], [email protected], [email protected]
Abstract—Many fields of Artificial Intelligence, including natural language processing, speech processing, and robotics, have seen explosive growth in the last decade. In this paper, we discuss the application of these advances to aid children with autism and dyslexia. This paper describes two systems we are currently developing - an alternative and augmentative communication (AAC) system for children with autism and a speech modification system for children with dyslexia. We also present the results of our initial evaluation.
Keywords-Autism; Dyslexia; Alternative and Augmentative Communication (AAC) systems; Text to Picture Conversion
I. INTRODUCTION
The applications of artificial intelligence in fields ranging from the arts to business are well known. A less investigated problem, nonetheless gaining momentum in recent years, is the application of AI in developing systems to improve the quality of life of individuals with cognitive disabilities. These systems are known as assistive technologies for cognitive rehabilitation, or cognitive prostheses. As the name suggests, these are technological interventions meant to improve the functional capabilities of individuals with cognitive disabilities. These devices could range from simple alarms to social robots. The development of such assistive technologies brings an additional dimension to system design – the human factor. Since assistive technologies are always targeted at a particular group of individuals with a unique set of abilities and disabilities, researchers need to consider factors such as the cognitive profile of the group and usability issues specific to that group while designing assistive systems. The evaluation of these systems should also be done with human subjects. Thus, developing assistive technologies requires expertise in the domains of both artificial intelligence and cognitive psychology. In this paper, we describe two systems – an assistive reading system for children with Dyslexia and an alternative and augmentative communication system for children with Autism. The rest of the paper is organized as follows. We first provide a brief introduction to Dyslexia and Autism. We then describe the assistive systems we are developing and the AI challenges involved.
II. DYSLEXIA AND SPEECH MODIFICATION
Developmental dyslexia is an unexpected difficulty in reading in children and adults who otherwise possess adequate intelligence and motivation. The prevalence rate is estimated to be around 5% to 10% in school-age children. Dyslexia can be comorbid with language difficulties, and with writing and mathematics disorders.
The goal of reading is to acquire meaning from text. Reading involves parsing the text and mapping it to an internal representation of sounds (except in users of sign language) and meaning in the brain. One possible cause of dyslexia is an improper internal representation of these sounds. The exact cause of this improper representation is still a matter of debate [1]. It has been argued that this improper representation could be due to more fundamental deficits in auditory perception. A variety of problems in auditory perception have been reported in the literature - difficulties in backward masking, temporal order judgment, perceiving amplitude and frequency modulations, and perception in noise [2, 3, 4]. These difficulties affect speech discrimination, for example, the difference between two acoustically close phonemes like /ba/ and /da/. This leads to a poor representation of phonemes and hence creates difficulties in reading.
A. Clear Speech and Related Work
There exists some evidence that speech modification could aid children with dyslexia. Currently there is only one system, Fast ForWord [5], that uses speech modification as an intervention for such children. Recent psychoacoustic studies show that a particular style of speech known as clear speech enhances intelligibility for children with dyslexia [6] and, more generally, for those who have difficulties in hearing [7]. Clear speech is defined as the style of speech that results when one attempts to make his/her speech more intelligible. Speakers resort to clear speech when they think listeners do not have knowledge of the language, have some hearing impairment, or when the background is noisy. There is no clear consensus on the right set of features that differentiate clear speech from conversational speech. Some of the proposed features are an increased consonant-vowel (CV) ratio, increased vowel space, a slower rate of speaking, increased pitch range, less co-articulation and salient stop releases [8, 9].
B. Experiment
In this work, we consider three features of clear speech - duration, consonant-vowel ratio and vowel formant space. We are currently studying the effects of modifying each of these features on children with dyslexia. In the next section, we describe an experiment which shows the effects of altering the duration of speech on children with dyslexia.
C. Participants
Seventeen participants were recruited from the fifth grade of a primary school. The average age of the students was 10 years and all of them had English as their second language in school. All the students were initially assessed
for reading difficulties by a clinical psychologist. This initial screening was based on feedback on their general performance in the class and their performance in a spelling test. This test required the subjects to spell 10 simple English words. Based on their performance on these two evaluation criteria, four subjects (3 boys and 1 girl) were chosen for the study.
D. Method
The stimuli consisted of ten English words containing, on average, four syllables per word. All words were recorded in a single female voice. The recording was done at a sampling rate of 44.1 KHz. All words were chosen to be above the level of the children, so that we could ensure that they were indeed spelling the words from what they were hearing rather than from memory. These words were then slowed down by 50%. Participants were individually administered the normal speech and slowed-down speech stimuli. The order in which the normal and rate-modified speech was presented was randomized. They were then asked to spell the words they heard.
E. Results
The responses of the 4 children were collected. Comparison was made at the syllable level and the number of syllables the children had spelled correctly was identified. The mean was computed for both the normal and slowed-down stimuli. The results are shown in Fig. 1.
Fig. 1. Mean scores with normal and rate-modified speech.
F. Discussion
The results indicate that the perception of syllables improves with slowed-down speech. In this experiment we studied the effect of slowing down speech on the auditory perception of children with dyslexia. Further experiments will be conducted to establish the effect of increasing the consonant-vowel ratio and the vowel space. These factors, we believe, will improve auditory discrimination for children with reading difficulties.
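As an illustration of the rate modification applied to the stimuli, speech can be slowed without shifting its pitch using an off-the-shelf time-stretching routine. The sketch below uses librosa; this is our choice for illustration and not necessarily the tool the stimuli were prepared with.

```python
import librosa
import soundfile as sf

# Load a recorded word (44.1 kHz, as in the experiment).
y, sr = librosa.load("word.wav", sr=44100)

# rate < 1 stretches the signal: rate=0.5 doubles the duration,
# slowing the word down while preserving its pitch.
y_slow = librosa.effects.time_stretch(y, rate=0.5)

sf.write("word_slow.wav", y_slow, sr)
```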
III. ASSISTIVE READING SYSTEM
Text to speech conversion is a relatively mature branch of speech research. But the quality of speech produced may not be suited to all target groups. Listeners with profound hearing difficulties, as in the case of sensorineural hearing loss, or mild difficulties, as discussed above in the case of dyslexia, might require a different quality of speech output depending on their acoustic profile. While it might be extremely difficult to make such modifications in natural continuous speech, it is relatively easy in synthesized speech to manipulate factors such as the time scale or to selectively process certain phonemes. We are currently developing an assistive reading system. This system incorporates two components: a text to speech system to render speech in a form suited to the auditory profile of dyslexia as discussed above, and a second component to display the pictures corresponding to the word being read. While the first component facilitates text-to-sound access, the second gives easy text-to-meaning access, thus making reading more accessible for children with dyslexia.
IV. AUTISM AND ALTERNATIVE AND AUGMENTATIVE COMMUNICATION SYSTEMS
A. Communication difficulties in Autism
Autism is a developmental disorder of neural origin characterized by a lack of social interaction, poor verbal and non-verbal communication and stereotyped interests. The language and communication skills of children with autism (CWA) vary widely. Some children have no functional communication, some are echolalic with extremely limited comprehension, and some children do develop language but cannot understand abstractions such as idioms, metaphors and stories [10]. They also have difficulty with abstraction and generalization, but are excellent visual thinkers.
B. Related Work
Augmentative and Alternative Communication (AAC) devices such as PECS and VOCA have been used with children with autism [11]; with these, children can choose pictures to construct sentences. Historically, these devices were developed to aid children in expressing themselves (expressive communication) but not in understanding others' speech (receptive communication). However, in the case of autism, where a child has difficulty with both expressive and receptive communication, these devices do not complete the communication process, as they assist only in expressive communication. The input modality is not visual, so the child still has to understand the caregiver's language to communicate. In our work we propose to develop an AAC that rectifies this and hence is more specifically suited to autism. Creating a system that could convert text to pictures as well as pictures to text requires many aspects of natural language processing and opens up new avenues for research. In the next section, we describe the various components of the proposed system and then discuss the technical issues involved.
V. FEATURES OF PROPOSED AAC
A. Two way communication
The AAC should aid in both expressive and receptive communication. We are building a device that could convert language to pictures and pictures to language, and hence complete the communication loop. The communication chain is as shown in Figs. 2 and 3. The caregiver has a device which could send out a text message to the child. This text message would be received by the handheld device the child has. This device would have a mechanism to convert the text into a set of relevant pictures describing it.
Fig. 2. Expressive communication in AAC for children with autism.
Fig. 3. Receptive communication in AAC for children with autism.
Rendering the language as pictures removes the need for abstraction on the child's part and takes advantage of their strong visual thinking skills. To communicate back, the device would present the child with a set of pictures. They can choose the pictures that would describe their needs. An appropriate sentence would be constructed, synthesized using a text to speech system, and spoken back to the caregiver.
B. Intuitive symbol set
Most AACs use pictures derived from sources such as Picture Communication Symbols (PCS), which were designed for use in static cards. Many of these symbols are very abstract for a CWA to understand and do not suit the Indian context. The best symbol set suited for CWA should represent most of its core vocabulary using symbols that are intuitive to the child. This can be achieved by using a combination of both pictures and animations, as some concepts could be more easily represented using actions. Allowing the children to add their own pictures enables us not only to increase their picture vocabulary over a period of time but also to personalize the symbol set to a particular CWA.
C. Intelligent user interface
The pictures in the AAC would be related using a semantic network. This would ensure that once the child selects a picture, only related pictures would appear. This network will be improved based on the user's choices. This ensures that the device adapts to the child over a period of time.
D. Reuse
In typical communication we reuse many sentences. But users of AAC do not have this advantage. They have to produce the same effort every time they want to repeat a construct. To avoid this, the device would identify some of the most commonly used constructs and let users reuse them.
The AAC would be evaluated in a scenario where the child is interacting with the caregiver. The parameters for evaluation would be based on the improvement in the number of times the child initiates communication, the increase in comprehension, and whether the child's ability to generalize the communication scenario improves. The next section describes the two key components of the AAC.
VI. PICTURE TO TEXT AND TEXT TO PICTURE CONVERSION SYSTEMS
A. Picture to text conversion
This is the key component of the expressive communication system. In AAC systems, this is done by using a fixed-vocabulary symbol system. The vocabulary is decided by interacting with children and understanding their communication goals. The vocabulary size of current AAC systems ranges from 2000 to 5000. The holy grail of AAC research is to achieve the user's communication goals with a minimum of keystrokes. Given the limited number of symbols that can be displayed on a screen, the amount of navigation the child needs to perform to construct a sentence needs to be minimized. Multiple approaches have been tried to address this problem, including the use of semantic networks [12].
B. Text to Picture conversion
Given the text input, the AAC system needs to be able to display a set of pictures corresponding to it. This involves parsing the text, identifying keywords that would be rendered as pictures, and deciding the relationships among the pictures based on the relationships among the selected keywords.
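A toy sketch of the text-to-picture step is given below: extract content words and look them up in a picture vocabulary. The vocabulary, stopword list and tokenization here are stand-ins; the actual system would use the semantic network and parsing discussed above.

```python
# Hypothetical picture vocabulary: word -> image file in the symbol set.
PICTURES = {"eat": "eat.png", "apple": "apple.png", "school": "school.png"}
STOPWORDS = {"the", "a", "an", "to", "is", "are", "your", "please"}

def text_to_pictures(message):
    """Map a caregiver's message to an ordered list of picture symbols."""
    words = message.lower().strip(".?!").split()
    keywords = [w for w in words if w not in STOPWORDS]
    # Keep only keywords with a picture; unknown words would need fallback
    # handling, such as prompting the caregiver to add a new symbol.
    return [PICTURES[w] for w in keywords if w in PICTURES]

print(text_to_pictures("Please eat your apple"))  # ['eat.png', 'apple.png']
```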
VII. CONCLUSION
In this paper, we described two systems that we are currently developing for children with Autism and Dyslexia. Both these systems make use of components such as text to speech synthesis and natural language processing. Many other applications are possible, such as using head tracking as an alternate input mechanism for computers when children have developmental motor disabilities like cerebral palsy. This is a promising research area, where an understanding of cognitive disorders and knowledge of technology can go a long way in improving the quality of life of individuals with disabilities.
ACKNOWLEDGMENT
We thank SAHAAI, IIIT-Hyderabad for its support for this project. We thank Dr. Kiranmayi, Deeksha foundation, for her support in the experiments. We thank the children and their teachers who participated in the experiments for their support.
REFERENCES
[1] Franck Ramus et al., "Theories of developmental dyslexia: insights from a multiple case study of dyslexic adults," Brain, vol. 126, pp. 841–865, 2003.
[2] Paula Tallal, "Auditory temporal perception, phonics, and reading disabilities in children," Brain and Language, vol. 9, pp. 182–198, 1980.
[3] C. Witton, J.B. Talcott, et al., "Sensitivity to dynamic auditory and visual stimuli predicts nonword reading ability in both dyslexic and normal readers," Current Biology, vol. 8, pp. 791–797, 1998.
[4] J.C. Ziegler, C. Pech-Georgel, F. George, and C. Lorenzi, "Speech-perception-in-noise deficits in dyslexia," Developmental Science, vol. 12, 2009.
[5] Srikantan S. Nagarajan et al., "Speech modifications algorithms used for training language learning-impaired children," IEEE Transactions on Rehabilitation Engineering, vol. 6, pp. 257–268, 1998.
[6] A.R. Bradlow, N. Kraus, and E. Hayes, "Speaking clearly for children with learning disabilities: sentence perception in noise," Journal of Speech, Language, and Hearing Research, vol. 6, pp. 80–97, 2003.
[7] M. A. Picheny, N. I. Durlach, and L. D. Braida, "Speaking clearly for the hard of hearing I: Intelligibility differences between clear and conversational speech," Journal of Speech and Hearing Research, vol. 28, pp. 96–103, 1985.
[8] J. C. Krause and L. D. Braida, "Acoustic properties of naturally produced clear speech at normal speaking rates," Journal of the Acoustical Society of America, vol. 115, pp. 362–378, 2004.
[9] M. A. Picheny, N. I. Durlach, and L. D. Braida, "Speaking clearly for the hard of hearing II: Acoustic characteristics of clear and conversational speech," Journal of Speech and Hearing Research, vol. 29, pp. 434–446, 1986.
[10] Isabelle Rapin and Michelle Dunn, "Update on the language disorders of individuals on the autistic spectrum," Brain and Development, vol. 25, pp. 166–172, 2003.
[11] Pat Mirenda, "Toward functional augmentative and alternative communication for students with autism: Manual signs, graphic symbols, and voice output communication aids," Language, Speech, and Hearing Services in Schools, vol. 34, pp. 203–216, 2003.
[12] Sonya Nikolova, Jordan Boyd-Graber, Christiane Fellbaum, and Perry Cook, "Better Vocabularies for Assistive Communication Aids: Connecting Terms using Semantic Networks and Untrained Annotators," ASSETS, Eleventh International ACM SIGACCESS Conference on Computers and Accessibility, Pittsburgh, 2009.
RoboCup SSL Team Description, IRL RC
B. Sujith Kumar, P. S. Abhimanyu, B.V.V.P. Bharagav
Electronics and Communications Engineering, IIIT Hyderabad, Hyderabad, India
[email protected], [email protected], [email protected]
Pratik Agarwal
Department of Computer Science and Engineering, Manipal Institute of Technology, Manipal, India
[email protected]
Dr. K. Madhava Krishna
Robotics Research Center, IIIT Hyderabad, Hyderabad, India
[email protected]
Abstract—This paper describes the robotic system of the IRL RC (IIIT-H Robotics Lab RoboCup) team, which participated in RoboCup'09 India. The whole system is divided into three main parts: mechanical, electrical and software systems. A summary of all these systems is given in the respective sections. Path planning for mobile robotics has always been of interest to researchers. The Rapidly Exploring Random Tree (RRT) has been used extensively for path planning in the past decade. Its advantage is that it can generate a path in even the most randomly distributed environments. This paper also gives an algorithm for generating a smooth path using RRT in an environment containing rapidly moving obstacles.
Keywords- AI, RRT, Path Planning, Robot Soccer, Multi Agents.
I. INTRODUCTION
Robotics is an area where many technologies and fields can be merged. It also develops the ability to solve real-time problems. In this paper we explain our robotic system that was developed for the RoboCup Small Size League (SSL). RoboCup SSL is a competition where a team of five autonomous soccer-playing robots competes with another team; it was introduced for the first time in India as a nationwide competition in 2009 by IIT-Kharagpur. This paper focuses on our SSL robotic system (past and present), touching upon many relevant aspects. This paper is organized as follows: Section II describes the mechanical system of the robot. Section III describes the electrical and electronic elements involved. Section IV describes the software system, focusing on two important parts, namely the vision system and the decision system. Section V discusses our path planning algorithm. Section VI deals with our current work.
II. MECHANICAL SYSTEM
The robot is omni-directional with four custom-made Swedish wheels. Any two wheels are separated by an angle of 120° or 60°. Each wheel contains sixteen symmetrically aligned rollers, and each roller is parallel to the axis of rotation of the wheel to give an extra degree of freedom. These wheels are attached to stepper motors of the type 16PU-M301-G1. The robot's CAD model is given in Fig. 1 and a picture of the actual robot is given in Fig. 2. The robots fit well within the restrictions of the SSL rules. Each robot has a diameter of 179 mm and a height of 150 mm.
Figure 1. CAD model of the robot
Figure 2. Omni directional robot
III. ELECTRICAL SYSTEM
The central controller consists of one ATMega16 and one ATMega8 microcontroller. The ATMega16 communicates with Maxstream XBee Series 2 modules at a baud rate of 9600 bps. These modules are low-power RF devices which work on the Zigbee protocol. An RF device is embedded in each bot, and they communicate with the XBee module connected to the Decision System. The command from the Decision System to the bot contains information about the speed of each wheel, dribbling on/off and kicking on/off. The ATMega8 generates the required signals for dribbling and kicking. It acts as a slave to the main controller, the ATMega16. The stepper motors are driven by L298-L297 pairs. The driver circuit for the stepper motors consists of four L298-L297 pairs connected to the ATMega16. The servo motor and the dribbler roller motor are driven by one L293D connected to the ATMega8. Our custom-made double-layered PCBs for the central controller and the driver circuits are shown in Fig. 3. The whole electrical system is driven by an 11.1 V, 2200 mAh Li-Po battery. To cool the electrical system we use a CPU fan which consists of a 12 V DC brushless motor.
Figure 3. Central controller
IV. SOFTWARE SYSTEM
Our software system is divided into two different entities: one is the Vision System and the other is the Decision System. They communicate with each other using UDP through socket programming. Their descriptions are given below.
A. Vision System
We are using a Unibrain Fire-i firewire digital camera (IEEE 1394) which supports 30 fps in the YUV 4:1:1 mode with a resolution of 640 x 480. We are using the OpenCV open-source library for all the vision processing. Our algorithm first finds the centers of the robots and the ball in a frame using thresholds in YUV space, discarding the Y channel to compensate for the effects caused by illumination changes. Our algorithm is based on the algorithm described in [1], which can determine the IDs of the robots and their orientations robustly. The center markers are used to detect the robots; after finding the center of each robot, the vision system has to identify the bot's ID using the extra colored markers, and it also has to determine the orientations. Examples of different robot tops are shown in Fig. 4. We have implemented an algorithm which can do both these jobs in one shot. It involves the following steps:
• Taking circular data around the center markers and converting them into 1-dimensional signals
• Identifying the bots using Minimum of Minimum Mean Squares
• Determining the orientation of each robot using the Minimum Mean Squares obtained in Step 2
Figure 4. Markers on top of each robot
This system sends the bots' positions, their orientations and the ball's position to the Decision System using UDP socket programming. We have chosen UDP because it is connectionless and does not do error correction and flow control as TCP does, which makes TCP slower compared to UDP.
B. Decision System
The decision system receives the information described above from the vision system and sends appropriate commands to the robots wirelessly. Our decision system, along with the control of the on-field bots, is divided into four layers. The bottom layer, Layer 1, consists of sending particular control commands, like the frequency of the stepper motor steps, to the bots using serial communication and the XBee devices. The next layer, Layer 2, uses Layer 1 and sends particular control commands like setting the wheel velocities independently. Layer 3 consists of commands which are used to move the robot at a particular speed in a particular direction with a particular angular velocity about its center. The strategic part is in Layer 4. The advantage of doing this is that even if we change the robot's type (differential drive or omni-directional), we have to change only Layer 1 while the remaining layers remain intact.
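The vision-to-decision link can be as simple as one datagram per processed frame. The sketch below shows the UDP exchange in outline; the JSON packet format is made up for illustration, as the actual message layout is not specified here.

```python
import json
import socket

# Vision side: send one packet per processed frame.
def send_frame(bots, ball, addr=("127.0.0.1", 9000)):
    # bots: list of (id, x, y, orientation); ball: (x, y)
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    sock.sendto(json.dumps({"bots": bots, "ball": ball}).encode(), addr)
    sock.close()

# Decision side: receive the latest frame. UDP fits because a lost
# datagram is simply superseded by the next frame; no retransmission
# or flow control is needed.
def recv_frame(port=9000):
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    sock.bind(("", port))
    data, _ = sock.recvfrom(4096)
    sock.close()
    return json.loads(data.decode())
```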
V. RRT
Rapidly Exploring Random Tree (RRT) [2] has been used extensively for path planning in the past decade. Its advantage is that it generates a path in even the most randomly distributed environments. Below we explain the RRT algorithm and modify it to generate a smooth path in an environment containing rapidly moving obstacles. We have developed this algorithm mainly to assist us in path planning for our non-holonomic robot for
RoboSoccer. RRT [2] plans a path according to the present situation of the obstacles. So, for a dynamic environment such as RoboSoccer, where robots can move with a velocity higher than 5 m/s in a confined region of 6 m by 4 m, the best way to find an obstacle-free path is to generate the path repeatedly after every fixed interval of time. There are techniques such as using a goal bias [3] and a waypoint bias [6] to reduce the computation, but it is redundant to repeatedly generate the path. [6] addressed the problem of path planning in a dynamic environment using RRT by introducing a waypoint cache and by implementing a beta search, but it does not reduce the burden of generating the paths repeatedly. Our algorithm improves the planning significantly for robots in a rapidly changing dynamic environment. RRT inherently gives us a path which is based on random points, and hence the path achieved will always be very zigzag in nature. We present here an algorithm which minimizes the generation of new paths and also smooths the path followed by the robot. Our algorithm computes the path using a waypoint cache, but only when the current path is not feasible. It then tells the robot to go to the farthest point on the computed path up to which it can travel in a straight line without hitting any obstacles in the current situation. After each time interval it computes the next farthest point which is possible to reach. Once it reaches this point, it recomputes the path, similar to the waypoint cache method described by [6].
A. RRT Goal Bias
In naive RRT path generation we find a random point on the map and find the nearest node on our tree to this point. Initially the tree contains only the start point as the root node. We attach a new node to this closest node of the tree if it is possible to move a fixed distance (METRIC) in the direction of this random point from our closest node. This is the most general RRT algorithm. Next, a goal bias was introduced to bias the path generated towards the destination. Here we extend the tree towards the goal with probability p and towards random points with probability 1-p. The algorithm for this is given in Fig. 5.
B. RRT Waypoint Bias
This goal-biased RRT algorithm was further improved by [6] by implementing a waypoint cache. They added a third probability, and hence for re-planning the tree is extended with a probability p towards a random point, a probability q towards the goal, and a probability of 1-p-q towards a randomly chosen waypoint, which is a point on the previously generated path. They re-compute the path after each time interval.
C. Modified Smooth RRT
We use the same algorithm as described by [6] for generating the path for the first time, or if the destination changes. We introduce a point called the current destination. The current destination is the farthest point on the generated path up to which the robot can go in a straight line without a collision. We move the robot with a velocity so that it can reach this point. The advantage of this is that too many turns are avoided. At each time interval we just need to update the current destination and assign the required velocities to the robot.
Figure 5. Algorithm for goal biased RRT
Figure 6. Algorithm for waypoint biased RRT
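One growth step of the goal-biased variant of Section V-A can be sketched as follows; the free-space sampler and the collision test are assumed to be supplied by the environment model.

```python
import math
import random

METRIC = 0.1   # fixed extension distance

def extend_rrt(tree, goal, p_goal, sample_free, collision_free):
    """One iteration of goal-biased RRT. tree: list of (point, parent_index)."""
    # With probability p_goal aim at the goal, otherwise at a random point.
    target = goal if random.random() < p_goal else sample_free()
    # Nearest node on the tree to the target.
    idx, (near, _) = min(enumerate(tree),
                         key=lambda n: math.dist(n[1][0], target))
    d = math.dist(near, target)
    if d == 0:
        return None
    # Step a fixed distance METRIC from the nearest node toward the target.
    step = min(METRIC, d)
    new = tuple(a + step * (b - a) / d for a, b in zip(near, target))
    if collision_free(near, new):
        tree.append((new, idx))
        return new
    return None
```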
In Fig. 7 the path in red shows the actual path generated by the RRT algorithm. The black circle is the farthest point the robot can reach by traveling in a straight line without hitting any obstacle in the current situation. We give this point as the current destination. As the situation changes, the current destination will also change. At each time cycle we find the farthest current destination which can be achieved according to the current configuration. We use this generated path repeatedly, till we reach the required destination. When a collision is detected, the path is recomputed. The velocities required to reach the current destination (the black circle) are calculated and achieved by the robot.
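Per cycle, updating the current destination reduces to a farthest line-of-sight check along the cached path. A sketch, assuming a `segment_clear` test that checks a straight segment against the obstacle positions at the current instant:

```python
def current_destination(path, robot_pos, segment_clear):
    """Farthest waypoint on the cached path reachable in a straight line.

    path: waypoints from the RRT, ordered from robot to goal;
    segment_clear(a, b): True if the straight segment a-b is free right now.
    """
    dest = None
    for waypoint in path:
        if segment_clear(robot_pos, waypoint):
            dest = waypoint        # keep extending the straight segment
        else:
            break                  # beyond this point the robot must turn
    return dest                    # None: no visible waypoint, so re-plan
```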
The first obvious advantage of this method is that the path traversed by the robot is smoother. The robot is made to travel in a straight line as much as possible. The second advantage is that the regeneration of the path is minimized as much as possible. At a later time it might become possible to go from a certain location to another which is currently obstructed. By using this method we may be able to take a shortcut from one waypoint to another even though the path generated by the RRT does not contain it. [5] uses a similar technique to smooth the path generated from RRT by putting the waypoints on a stack. It converts the path into line segments, which reduces the jaggedness to a few straight lines. But that algorithm is for static obstacles. We do not need to convert the whole path into a few straight lines, because we never end up traveling the generated path to the end as the environment changes dynamically.
VI. SIMULATOR
The simulator is a very important piece of software, written in Qt. It helps us simulate a game. The wireless module, which in a real situation would communicate with the robots, thinks it is still communicating with the robots but instead communicates with the simulator. The simulator has a physics engine which finds out if a robot undergoes a collision with the ball, with the wall or with other robots. It simulates a real environment as closely as possible. The results were tested on the simulator. The simulator was written to test the AI and strategy modules and all the other code without actually causing any harm to the hardware. It simulates the collisions of robots with robots and of robots with the ball. When the ball collides with a robot, the simulator predicts the new velocities of the ball depending upon the parameters of the collision. The simulator was also used to test the path planning algorithm, as shown in Fig. 7. The simulator receives the commands sent by the controller for the actual robots and simulates the environment. We send the velocities of the various robots, and the simulator predicts the next position or performs collision restitution. In a real situation the strategy module would only interact with the vision module for data, but for testing purposes we use the simulator.
VII. CURRENT WORK
Currently we are working on improving the hardware. On the software side we are working towards coming up with an intelligent strategy to play soccer. We will be using a play- and skill-based strategy. A play represents the high-level strategy. It can be compared to the formations we have in a normal game of soccer. In our method each robot will be assigned a role, where a role can be defender, attacker, goalie, etc. Since each of our robots is equally capable of performing each role, the roles are dynamically assigned to the robots. Each robot can perform a group of skills, and each robot is assigned the skills to be performed for the role assigned to it. For example, the goalie may be assigned to keep tracking the ball and minimizing the open area of the goal, while the attackers may be assigned to shoot if they are open, or to move to an open position. We can have various plays and various skills, which we are still experimenting with. This technique is based on the technique used by the Cornell team [7].
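As a flavour of the dynamic role assignment just described, roles can be recomputed every cycle from the world state. The greedy policy below (attacker = closest to ball) is one simple possibility for illustration, not necessarily the team's final strategy.

```python
import math

def assign_roles(robot_positions, ball_pos):
    """Greedy role assignment for five identical robots.

    robot_positions: dict robot_id -> (x, y). Returns robot_id -> role.
    """
    ids = sorted(robot_positions,
                 key=lambda r: math.dist(robot_positions[r], ball_pos))
    roles = {ids[0]: "attacker"}          # closest robot plays the ball
    roles[ids[-1]] = "goalie"             # farthest falls back (toy rule)
    for r in ids[1:-1]:
        roles[r] = "defender"
    return roles
```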
Figure 7. RRT on simulator
REFERENCES
[1] Shichi Shimizu, Tomoyuki Nagahashi, and Hironobu Fujiyoshi, "Robust and Accurate Detection of Object Orientation and ID without Color Segmentation".
[2] S. M. LaValle, Planning Algorithms. Cambridge University Press, 2006.
[3] S. LaValle, "Rapidly-exploring random trees: A new tool for path planning", Iowa State University, Dept. of Computer Science, Tech. Rep. 9811, 1998.
[4] Zoltan Deak Jnr. and Ray Jarvis, "Robotic Path Planning using Rapidly Exploring Random Trees", Intelligent Robotics Research Centre, Monash University.
[5] S. M. LaValle and J. J. Kuffner, Jr., "Randomized kinodynamic planning," The International Journal of Robotics Research, vol. 20, no. 5, pp. 378-400, 2001.
[6] Bruce, J. and Veloso, M., "Real-time randomized path planning for robot navigation", in Proceedings of IROS-2002, Computer Science Department, 2002.
[7] Raffaello D'Andrea, Tamas Kalmar-Nagy, Pritam Ganguly, and Michael Babish, "The Cornell RoboCup Team".
[8] Veloso, M., Bowling, M., Achim, S., Han, K., and Stone, P., "The CMUnited-98 champion small robot team", in RoboCup-98: Robot Soccer World Cup II, Springer Verlag, 1999.
[9] Browning, B., Bruce, J.R., Bowling, M., and Veloso, M., "STP: Skills, tactics and plays for multi-robot control in adversarial environments", Journal of System and Control Engineering, 2005.
[10] Extended Team Description for RoboCup 2009, B-Smart, Deutsches Forschungszentrum fur Kunstliche Intelligenz GmbH, Germany.
[11] CMDragons 2009 Extended Team Description, Carnegie Mellon University, USA.
[12] Plasma-Z Extended Team Description, Chulalongkorn University, Thailand.
[13] Michael Bowling, Brett Browning, Allen Chang, and Manuela Veloso, "Plays as Team Plans for Coordination and Adaptation".
DNA Sequence Reconstruction Based on Smarter GA and Best Rearrangement

Shubhashis Sengupta
West Bengal University of Technology
BF-142, Sector-1, Salt Lake, Kolkata, India
[email protected]
Abstract— Recent advances in the biological sciences have created new problems for computational biology. One such problem is determining the sequence of DNA. One option for this process is to use a Sequencing by Hybridization (SBH) experiment to obtain information from the physical structure of DNA. Various methods have been proposed to reconstruct DNA sequences from SBH data, including hybrid genetic algorithms. This paper describes a genetic algorithm intended to be more robust than previous methods, suggesting that robust genetic methods may offer techniques to further improve on current approaches.

Keywords— DNA sequencing, Sequencing by hybridization, SBH, Genetic algorithm, Hybrid GA, Computational biology, population, fitness, mutation
I. INTRODUCTION

The DNA molecule, the fundamental fabric of life, carries the genetic instructions for making living organisms. The DNA segments that carry this genetic information are called genes; other DNA sequences have structural purposes or are involved in regulating the use of this genetic information. Chemically, DNA is a long polymer of simple units called nucleotides, which are held together by a backbone made of alternating sugars and phosphate groups. Attached to each sugar is one of four types of molecules called bases: adenine (A), thymine (T), cytosine (C) and guanine (G). It is the sequence of these four bases along the backbone that encodes information.

In chemical laboratory methods of DNA sequencing, a long DNA strand is chopped into slices, and a biochemical method is then applied to detect the nucleotide sequence in each fragment. These known fragments are then used to form a superstring, which is almost the same as the original DNA strand. The limitations of conventional DNA sequencing methods can be overcome in the hybridization technique with the help of a genetic algorithm, where in a single biochemical experiment (using a DNA probe chip) it is possible to determine almost all the subsequences of a specified length that are present in the original sequence. Our goal is to find an efficient way to sequence DNA using a GA.

II. PROBLEM DESCRIPTION

DNA is formed from two monomer chains. These chains consist of four types of nucleotides, or bases, called adenine, thymine, cytosine and guanine (A, T, C and G). The monomer chains are held together by hybridization between complementary pairs of bases: adenine is always the complement of thymine (A and T), and cytosine is the complement of guanine (C and G).
This complementary nature of DNA is fundamental to the SBH method of sequencing DNA.

A. The Sequencing by Hybridization (SBH) Problem

The method of SBH consists of two steps. The first step is a chemical experiment based on the principle of complementary hybridization of two single-stranded DNA chains. An oligonucleotide library is prepared on a DNA chip; the library consists of short, distinct oligonucleotides (probes). During the experiment, these probes are compared with many copies of a fluorescently labelled DNA sequence (called the target sequence). If a probe and a fragment of the target sequence are complementary, a fluorescent signal is emitted at their complementary places. As a result of the experiment one gets a set of probes that compose the DNA sequence; this set is called a spectrum. In the second step of SBH, an algorithm reconstructs a sequence from the spectrum.

Two types of errors may be introduced during the hybridization experiment:
• Negative errors: some probes contained in the target sequence do not appear in the spectrum.
• Positive errors: some probes not contained in the target sequence appear in the spectrum.

B. Existing Heuristic Methods for DNA Sequencing

Blazewicz et al. [5] proposed a heuristic algorithm based on tabu search; it produces near-optimal solutions and introduces a strategy of performing moves on a cluster. Later, Blazewicz et al. [6] proposed an overlapping-window heuristic based on the same method, in which the solution is composed from the most probable segments created by maximization of a thickness function. Blazewicz and Kasprzak [7] proved that, when the spectrum contains errors, the SBH problem is NP-hard. Later, Blazewicz et al. [8] proposed a combined tabu and scatter search algorithm with the same criterion function as in [5]. Blazewicz et al. [9] proposed a hybrid genetic algorithm that obtained results comparable to the algorithm in [6]. Brizuela et al. [10] proposed a genetic algorithm made effective by a greedy crossover operator. Endo [11] presented a successful method using a TSP model and a genetic algorithm; it maximizes the likelihood of assembling consecutive oligonucleotides after a sanitization process that gets rid of the oligonucleotides causing positive errors. Bui and Youssef [12] proposed an enhanced genetic algorithm in which single oligonucleotides are combined into clusters. Fernandes and Ribeiro [13] proposed a memory-based greedy randomized algorithm, also handling errors.
This algorithm makes use of a memory structure based on the frequency of consecutive probes appearing together in a set of elite solutions. Blum et al. [14] proposed a constructive heuristic; the idea is to identify growing sub-paths, in a completely connected directed graph, that are most probably parts of optimal solutions. Later, Blum et al. [15] gave an ant colony optimization algorithm, a MAX-MIN ant system implemented in the hyper-cube framework, which is a very effective method. Nikolakopoulos and Sarimveis [16] proposed a heuristic algorithm originally introduced for the asymmetric TSP; the approach differs from the one used in [20] because the objective is the minimization of the length of the produced array rather than the maximization of the number of oligonucleotides from the spectrum.

III. PROPOSED METHOD: SMARTER GA WITH BEST REARRANGEMENT (BR)

A. Representation of Individuals

Let the original sequence be N = GGACTGA and the oligonucleotide length l = 3. After hybridization, the following fragments are created: GGA, GAC, ACT, CTG, TGA.

Let S = {CTG, ACT, GGA, GAC, TGA} [taken randomly]. Each oligonucleotide is numbered:

  S = {CTG, ACT, GGA, GAC, TGA}
        0    1    2    3    4

Any individual can be represented as a permutation of 0, 1, 2, 3, 4; i.e., by 03421 we mean CTG GAC TGA GGA ACT. The overlaps in this sequence are counted as shown in Fig. 1: CTG and GAC overlap in G, and GGA and ACT overlap in A, while the remaining adjacent pairs do not overlap.

Total number of overlaps = 2, so the measured fitness = 2. (A small code sketch of this computation follows the figure.)

Figure 1. A sample oligonucleotide overlapping sequence and fitness calculation.
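The fitness of an individual is simply the sum of the pairwise overlaps between adjacent oligonucleotides. The paper's experiments used Visual Basic 6.0; the minimal sketch below (in Python, for brevity, with illustrative names) reproduces the worked example above and is not the authors' code:

    def overlap(a, b):
        """Length of the longest suffix of a that is a prefix of b."""
        for k in range(min(len(a), len(b)) - 1, 0, -1):
            if a[-k:] == b[:k]:
                return k
        return 0

    def fitness(individual, spectrum):
        """Total overlap between adjacent oligos of a permutation.

        individual: indices into spectrum, e.g. [0, 3, 4, 2, 1].
        """
        return sum(overlap(spectrum[i], spectrum[j])
                   for i, j in zip(individual, individual[1:]))

    S = ["CTG", "ACT", "GGA", "GAC", "TGA"]
    print(fitness([0, 3, 4, 2, 1], S))   # 2, as in Fig. 1
    print((len(S) - 1) * (3 - 1))        # 8: MaxFit for an error-free spectrum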
Our aim is to maximize the overlap so as to recover the original sequence with the highest accuracy. For an error-free spectrum, the highest total overlap possible is (|S| − 1) · (l − 1), i.e., (number of oligos − 1) × (length of each oligo − 1). In the example above this gives (5 − 1) · (3 − 1) = 8. (For l = 3 this is the same quantity as |S| · (l − 1) − 2.)

B. The Algorithm: SGA with BR

1) Input the oligo length (L), the population size (Popu) and the best rearrangement length (BRL).
2) Hybridize. Get the hybridized sequence (S).
3) Determine the maximum fitness: MaxFit = (|S| − 1) · (L − 1). The length of the original sequence is n = |S| + L − 1.
4) Build a population of size Popu with unique random sequences. Calculate each individual's fitness, full-overlap size and position. Fitness = total number of overlaps between the oligonucleotides of a given sequence. Generation = 1.
5) Display the highest fitness obtained with the corresponding sequence. Set the global fitness value to this.
6) Start evolution by setting StopEvolution = False and i = 0.
7) While StopEvolution = False and the solution is not found, do the following:
   a) While i is less than Popu: take the i-th sequence and find the best rearrangement possible BRL times, by fixing the fully overlapped oligos and rearranging the rest randomly. If its fitness reaches MaxFit, print the solution and exit, setting StopEvolution = True. If its fitness is better than the global fitness, update the global fitness. Generation = Generation + 1.
   b) Take the sequence with the highest fitness and apply intelligent mutation. If a solution is found, print it and exit. If the obtained fitness is better than the global one, replace the previous one and update the global best.
   c) Repeat until all best sequences have undergone intelligent mutation.

C. Best Rearrangement

The maximum overlap, of length l − 1, is calculated for each oligo pair; clusters of fully overlapped oligos are then treated as a single unit (Fig. 2) and rearranged, for faster convergence. (A code sketch of this step appears at the end of this section.)

D. Intelligent Mutation

Intelligent mutation changes a particular oligo only if it contributes to a full overlap by connecting two separate oligos, acting as the missing link between them. Otherwise, the oligo is mutated to suit the first or last position of the sequence, if possible.
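The paper describes the clustering step only in prose. The following is a minimal sketch, under the assumption that "full overlap" means two adjacent oligos overlapping in l − 1 characters; fully overlapped neighbours are frozen into single units before random rearrangement, as in Fig. 2. It is written in Python with illustrative names and is not the authors' implementation:

    import random

    def overlap(a, b):
        """Length of the longest suffix of a that is a prefix of b."""
        for k in range(min(len(a), len(b)) - 1, 0, -1):
            if a[-k:] == b[:k]:
                return k
        return 0

    def best_rearrangement(individual, spectrum, l, tries):
        """Fix fully overlapped clusters, shuffle them as units, keep the best.

        individual: list of indices into spectrum.  Adjacent oligos that
        overlap in l-1 characters are frozen into one cluster so that a
        random rearrangement cannot break them apart.
        """
        def fitness(ind):
            return sum(overlap(spectrum[i], spectrum[j])
                       for i, j in zip(ind, ind[1:]))

        # Group indices into clusters of consecutive full overlaps.
        clusters, current = [], [individual[0]]
        for i, j in zip(individual, individual[1:]):
            if overlap(spectrum[i], spectrum[j]) == l - 1:
                current.append(j)        # full overlap: same cluster
            else:
                clusters.append(current)
                current = [j]
        clusters.append(current)

        best = list(individual)
        for _ in range(tries):
            random.shuffle(clusters)
            candidate = [i for c in clusters for i in c]
            if fitness(candidate) > fitness(best):
                best = candidate
        return best

    S = ["CTG", "ACT", "GGA", "GAC", "TGA"]   # spectrum of GGACTGA, l = 3
    print(best_rearrangement([0, 3, 4, 2, 1], S, 3, tries=50))

For the small example the optimal order GGA GAC ACT CTG TGA (fitness 8 = MaxFit) is quickly found; on realistic spectra the clusters shrink the effective search space, which is the point of the BR step.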
IV. EXPERIMENTS AND RESULTS

The experiment was performed on a PC with a Pentium 4, 3 GHz, 512 MB RAM, and the Windows XP operating system. We used the Visual Basic 6.0 language for the experiment.

Table 1 shows the results of tests done with the proposed algorithm. The spectrum size is 100 to 500 individuals (over the four nucleotide bases). The number of optimal solutions returned by the algorithm, out of 40 runs, is shown as the percentage of solutions that were constructed using the optimal number of oligonucleotides (optimal quality).

Figure 2. A sample oligonucleotide with the occurrences of full overlaps.

TABLE I. AVERAGE OPTIMAL VALUES OF THE PROPOSED METHOD

  Spectrum size   Optimal value
  30              100%
  50              100%
  70              100%
  90              99.9%

Table 2 compares the proposed method with already published results, namely the tabu and scatter search algorithm (1), sequencing by hybridization (2), the hybrid genetic algorithm (3), and DNA Sequence Reconstruction with Successor Choice (4).

TABLE II. COMPARATIVE RESULTS (SUCCESS RATE %)

  Spectrum size   TSS (1)   SBH (2)   HGA (3)   GASC (4)   Proposed method
  100             100       97.5      100       100        100
  200             95        92.5      95        95         97.5
  300             77.5      77.5      50        80         80
  400             52.5      47.5      22.5      55         62.5
  500             45        37.5      12.5      50         60

All spectra used in the experiment are derived from real DNA sequences of human proteins, using their accession numbers.

V. CONCLUSION AND FUTURE WORK

We have presented an effective and efficient method for DNA sequence reconstruction based on a genetic algorithm. We introduced the best rearrangement and intelligent mutation operations to improve solution quality. Experimental results showed that the method can handle a long input sequence, yielding output sequences of quite good quality. Heuristics such as the tabu search algorithm, genetic algorithms and the ant colony optimization algorithm have produced optimized solutions, but the given algorithms do not work in practice for much longer DNA sequences. We need to study the algorithms further, by improving existing algorithms or introducing new strategies. Future work consists of the following directions:
• The reduction of errors in the spectrum, and the correction of reconstruction errors.
• The application of parallel computing in SBH. The consumption of time and memory is large, especially when the probe becomes long. Because of the discrete nature of DNA, the data could be assigned to multiple processors; parallel processing will increase efficiency.

REFERENCES
[1] J. Blazewicz, F. Glover, M. Kasprzak, "DNA sequencing - tabu and scatter search combined," INFORMS Journal on Computing, vol. 16, pp. 232-240, 2004.
[2] Ji-Hong Zhang, Ling-Yun Wu, Xiang-Sun Zhang, "Reconstruction of DNA sequencing by hybridization," Bioinformatics, vol. 19, pp. 14-21, 2003.
[3] J. Blazewicz, M. Kasprzak, W. Kuroczycki, "Hybrid genetic algorithm for DNA sequencing with errors," Journal of Heuristics, vol. 8, pp. 495-502, 2002.
[4] Md. Rafiqul Islam, Md. Rowshan Shahriar, Abul Faisal Mohammad Shaheed, "DNA Sequence Reconstruction Based On Genetic Algorithm," Malaysian Journal of Computer Science, vol. 21(1), 2008.
[5] J. Blazewicz, P. Formanowicz, M. Kasprzak, W. T. Markiewicz, J. Weglarz, "Tabu search for DNA sequencing with false negatives and false positives," European Journal of Operational Research, vol. 125, pp. 257-265, 2000.
[6] J. Blazewicz, P. Formanowicz, F. Guinand, M. Kasprzak, "A heuristic managing errors for DNA sequencing," Bioinformatics, vol. 18, no. 5, pp. 652-660, 2002.
[7] J. Blazewicz, M. Kasprzak, "Complexity of DNA sequencing by hybridization," Journal of Computational Biology, vol. 135, pp. 303-307, 2003.
[8] J. Blazewicz, F. Glover, M. Kasprzak, "DNA sequencing - tabu and scatter search combined," INFORMS Journal on Computing, vol. 16, pp. 232-240, 2004.
[9] J. Blazewicz, M. Kasprzak, W. Kuroczycki, "Hybrid genetic algorithm for DNA sequencing with errors," Journal of Heuristics, vol. 8, pp. 495-502, 2002.
[10] C. A. Brizuela, L. C. Gonzalez, H. J. Romero, "An improved genetic algorithm for the sequencing by hybridization problem," in Lecture Notes in Computer Science, Springer, Heidelberg, pp. 11-20, 2004.
[11] A. T. Endo, "Probabilistic nucleotide assembling method for sequencing by hybridization," Bioinformatics, vol. 20, no. 14, pp. 2181-2188, 2004.
[12] T. A. Bui, W. A. Youssef, "An enhanced genetic algorithm for DNA sequencing by hybridization with positive and negative errors," in Proceedings of GECCO 2004 - Genetic and Evolutionary Computation Conference, Lecture Notes in Computer Science, vol. 3013, Berlin, Germany: Springer, pp. 11-20, 2004.
[13] E. L. R. Fernandes, C. C. Ribeiro, "A multistart constructive heuristic for sequencing by hybridization using adaptive memory," Electronic Notes in Discrete Mathematics, vol. 19, pp. 41-47, 2005.
[14] C. Blum, M. Y. Valles, "New constructive heuristics for DNA sequencing by hybridization," in Proceedings of WABI 2006 - 6th International Workshop on Algorithms in Bioinformatics, Lecture Notes in Bioinformatics (LNBI), vol. 4175, Berlin, Germany: Springer, pp. 355-365, 2006.
[15] C. Blum, M. Y. Valles, M. J. Blesa, "An ant colony optimization algorithm for DNA sequencing by hybridization," Computers and Operations Research, vol. 35, pp. 3620-3635, 2008.
[16] A. Nikolakopoulos, H. Sarimveis, "A metaheuristic approach for the sequencing by hybridization problem with positive and negative errors," Engineering Applications of Artificial Intelligence, vol. 21, pp. 247-258, 2008.
[17] J. Blazewicz, C. Oguz, A. Swiercz, J. Weglarz, "DNA sequencing by hybridization via genetic search," European Journal of Operational Research, vol. 54, pp. 1185-1192, 2006.
[18] J. Blazewicz, P. Formanowicz, M. Kasprzak, "Selected combinatorial problems of computational biology," European Journal of Operational Research, vol. 161, pp. 585-597, 2005.
[19] P. A. Pevzner, "l-Tuple DNA sequencing: Computer analysis," Journal of Biomolecular Structure and Dynamics, vol. 7, pp. 63-73, 1989.
[20] J. Blazewicz, P. Formanowicz, M. Kasprzak, W. T. Markiewicz, J. Weglarz, "DNA sequencing with positive and negative errors," Journal of Computational Biology, vol. 6, pp. 113-123, 1999.
[21] A. Ben-Dor, I. Pe'er, R. Shamir, R. Sharan, "On the complexity of positional sequencing by hybridization," Journal of Computational Biology, vol. 8, pp. 361-371, 2001.
[22] Ji-Hong Zhang, Ling-Yun Wu, Yu-Ying Zhao, Xiang-Sun Zhang, "An optimization approach to the reconstruction of positional DNA sequencing by hybridization with errors," European Journal of Operational Research, vol. 182, pp. 413-427, 2007.
[23] Hongwei Xie, Qianqian Yuan, Ling Liao, "Algorithms for DNA Sequencing by Hybridization: A Review," IEEE, 2009.
Writing Technical Articles

Papers can be divided roughly into two categories, namely original research papers and survey papers. There are papers that combine the two genres, but most publication venues either only accept one or the other type, or require the author to identify whether the paper should be evaluated as a research contribution or a survey. (Most research papers contain a related-work section that can be considered a survey, but it is usually brief compared to the rest of the paper and only addresses a much narrower slice of the field.)

Research Papers

A good research paper has a clear statement of the problem the paper is addressing, the proposed solution(s), and the results achieved. It describes clearly what has been done before on the problem, and what is new.

The goal of a paper is to describe novel technical results. There are four types of technical results:
1. an algorithm;
2. a system construct, such as a hardware design, a software system, a protocol, etc. One goal of such a paper is to ensure that the next person who designs a system like yours doesn't make the same mistakes and takes advantage of some of your best solutions. So make sure that the hard problems (and their solutions) are discussed, and that the non-obvious mistakes (and how to avoid them) are discussed. (Craig Partridge)
3. a performance evaluation, obtained through analyses, simulation or measurements;
4. a theory, consisting of a collection of theorems.

A paper should focus on:
• describing the results in sufficient detail to establish their validity;
• identifying the novel aspects of the results, i.e., what new knowledge is reported and what makes it non-obvious;
• identifying the significance of the results: what improvements and impact do they suggest.

A typical outline of a paper is:
• Abstract, typically not more than 100-150 words;
• Introduction (brief!): introduce the problem and outline the solution; the statement of the problem should include a clear statement of why the problem is important (or interesting);
• Related work (here, or before the summary). In the case of a conference, make sure to cite the work of the TPC co-chairs and of as many other TPC members as are remotely plausible, as well as anything relevant from the previous two proceedings. In the case of a journal or magazine, cite anything relevant from the last two to three years of volumes.
• Outline of the rest of the paper: "The remainder of the paper is organized as follows. In Section 2, we introduce ... Section 3 describes ... Finally, we describe future work in Section 5." (Note that "Section" is capitalized. Also, vary your expression between "section" being the subject of the sentence, as in "Section 2 discusses ...", and "In Section 5, we discuss ...".)
• Body of the paper: problem; approach and architecture; results.

The body should contain sufficient motivation, with at least one example scenario (preferably two) with illustrating figures, followed by a crisp, generic problem statement, i.e., the functionality, particularly emphasizing "new" functionality; it may or may not include formalisms. General evaluations of your algorithm or architecture, e.g., material proving that the algorithm is O(log N), go here, not in the evaluation section. The architecture of the proposed system(s) should be described more generically than your own peculiar implementation; always include at least one figure.

Realization: contains actual implementation details when implementing the architecture is not totally straightforward. Mention briefly the implementation language, platform, dependencies on other packages and minimum resource usage, if pertinent.

Evaluation: how does it really work in practice? Provide real or simulated performance metrics and end-user studies, and mention external technology adoptors, if any.

Then: related work, if not done at the beginning; summary and future work, which often repeats the main results; acknowledgements; bibliography; and an appendix (to be cut first if forced to) containing detailed protocol descriptions, proofs with more than two lines, and other low-level but important details.

It is recommended that you write the approach and results sections first; these go together. Then write the problem section, if it is separate from the introduction, followed by the conclusions, and then the introduction. Write the introduction last, since it glosses the conclusions in one of its last paragraphs. Finally, write the abstract; last, give your paper a title.

Title
• Avoid all but the most readily understood abbreviations.
• Avoid common phrases like "novel", "performance evaluation" and "architecture", since almost every paper does a performance evaluation of some architecture, and it had better be novel. Unless somebody wants to see 10,000 Google results, nobody searches for these types of words. Use adjectives that describe the distinctive features of your work, e.g., reliable, scalable, high-performance, robust, low-complexity, or low-cost. (There are obvious exceptions, e.g., when the performance evaluation is the core of the paper; even then, something more specific is preferable, as in "Delay measurements of X" or "The quality of service for FedEx deliveries".)
• If you need inspiration for a paper title, you can consult one of the CS research topic generators available on the web.

Abstract
• The abstract must not contain references, as it may be used without the main article. It is acceptable, although not common, to identify work by author, abbreviation or RFC number (for example, "Our algorithm is based upon the work by Smith and Wesson").
• Avoid use of "in this paper" in the abstract: what other paper would you be talking about here?
• Avoid general motivation in the abstract: you do not have to justify the importance of the Internet or explain what QoS is.
• Highlight not just the problem, but also the principal results; many people read abstracts and then decide whether to bother with the rest of the paper.
• Since the abstract will be used by search engines, be sure that it contains the terms that identify your work. In particular, the name of any protocol or system developed and the general area ("quality of service", "protocol verification", "service creation environment") should be contained in the abstract.
• Avoid equations and math, unless your paper proposes E = mc2.

Introduction
• Avoid stock and cliché phrases such as "recent advances in XYZ" or anything alluding to the growth of the Internet.
• Be sure that the introduction lets the reader know what this paper is about, not just how important your general area of research is. Readers won't stick with you for three pages to find out what you are talking about.
• The introduction must motivate your work by pinpointing the problem you are addressing, and then give an overview of your approach and/or contributions (and perhaps even a general description of your results). Thus, the intro sets up expectations for the rest of the paper: it provides the context, and a preview.
• Repeating the abstract in the introduction is a waste of space.
• An example of a bad introduction:

  "Here at the institute for computer research, me and my colleagues have created the SUPERGP system and have applied it to several toy problems. We had previously fumbled with earlier versions of SUPERGPSYSTEM for a while. This system allows the programmer to easily try lots of parameters, and problems, but incorporates a special constraint system for parameter settings and LISP S-expression parenthesis counting. The search space of GP is large and many things we are thinking about putting into the supersystem will make this space much more colorful."

• A pretty good introduction, drawn from Eric Siegel's class:

  "Genetic programming (GP) is a relatively new technique for the automatic generation of computer programs. GP has been applied to a number of problems. For example, it is a benefit to give evolved programs direct access to low-level data arrays, as in some approaches to signal processing (citation), and protein segment classification (citation). This type of system automatically performs more problem-specific engineering than a system that accesses highly preprocessed data. However, evolved programs may require more time to execute, since they are solving a harder task."

  Note that you can also have a related-work section that gives more details about previous work. The first sentence of a contribution paragraph like the following should say what the contribution is:

  "In this chapter, we introduce a method that gives evolved programs the incentive to strategically allocate computation time among fitness cases. Specifically, with an aggregate computation-time ceiling imposed over a series of fitness cases, evolved programs dynamically choose when to stop processing each fitness case. We present experiments that show that programs evolved using this form of fitness take less time per test case on average, with minimal damage to domain performance. We also discuss the implications of such a time constraint, as well as its differences from other approaches to 'multiobjective problems'. The dynamic use of resources other than computation time, e.g., memory or fuel, may also result from placing an aggregate limit over a series of fitness cases."

  Finally, the outline:

  "The following section reviews related work in both optimizing the execution time of evolved programs and evolution over Turing-complete representations. Next we introduce the game Tetris as a test problem. This is followed by a description of the aggregate computation-time ceiling, and its application to Tetris in particular. We then present experimental results, discuss other current efforts with Tetris, and end with conclusions and future work."

Common Bugs in Writing

1. Avoid use of the passive voice if at all possible. For example, "In each reservation request message, a refresh interval used by the sender is included" reads better and shorter as "Each message includes a refresh interval".
2. Use strong verbs instead of lots of nouns, and simple words rather than fancy-sounding ones:
   make assumption → assume
   is a function of → depends on
   is an illustration of → illustrates, shows
   is a requirement → requires, needs to
   utilizes → uses
   had difference → differed
3. Check for missing articles, particularly if your native tongue doesn't have them. Rough concepts and classes of things don't need articles ("Routers route packets"), but specific ones do ("The router architecture we consider uses self-routing fabrics"). Don't use articles in front of proper nouns and names ("Internet Explorer is a popular web browser. The current version number is 6.0. Bill Gates did not write Internet Explorer.").
4. Each sentence in a paragraph must have some logical connection to the previous one: for example, it may describe an exception ("but", "however"), a causality ("thus", "therefore", "because of this"), two facets of an argument ("on the one hand", "on the other hand"), enumerated sub-cases ("first", "secondly"), or a temporal relationship ("then", "afterwards"). If there are no such links, check whether your sentences are indeed part of the same thought; maybe they belong in separate paragraphs, and even then the new paragraph clearly needs some logical connection to the paragraphs that preceded it.
5. Abbreviations typically do not take an article, even if the expanded version does; compare "we use TCP" with "we use the Transmission Control Protocol". LCD (liquid crystal display) is another common case, where many are tempted to incorrectly write "the LCD display".
6. Use consistent tense: present tense, usually, unless reporting results achieved in earlier papers.
7. "None" takes either singular or plural verbs, depending on the intended meaning (or taste): both "none of these mistakes are common" and "none of these mistakes is common" are considered correct.
8. Use hyphens for concatenated words: end-to-end architecture, real-time operating system (but "the computer ran in real time"), per-flow queueing, flow-enabled, back-to-back. In general, hyphens are used when adding prefixes that would result in double vowels (except after co-, de-, pre-, pro-), e.g., supra-aural, quasi-public, self-conscious, self-seeking (but selfhood, selfless); and to distinguish from a solid homograph: re-act vs. react, re-pose vs. repose, re-sign vs. resign, re-solve vs. resolve, re-lease vs. release. A compound adjective (an adjective made up of an adjective and a noun in combination) should usually be hyphenated: code-storage value, short-term loan, real-time operating system, application-specific integrated circuit, Internet-based. Words ending in -like take a hyphen when the preceding word ends in "l": shell-like.
9. Don't overuse dashes for separation, as they interrupt the flow of thought; in many cases, a comma-separated phrase works better. If you do use a dash, make sure it's not a hyphen (- in LaTeX) but an em-dash (--- in LaTeX).
10. Avoid scare quotes, as they suggest the writer is distancing himself from the term.
11. Numbers ten or less are spelled out: "consists of three fields", not "3 fields".
12. "Optimal" can't be improved: "more optimal" should be "better", or maybe "more nearly optimal".
13. Avoid in-line enumeration like "packets can be (a) lost, (b) stolen, (c) ...": the enumeration only interrupts the flow of thought.
14. Avoid itemizations (bullets), as they take up extra space and make the paper read like PowerPoint slides; bullets can be used effectively for emphasis of key points. To describe components or algorithms, the description environment often works better, as it provides a two-level delineation.
15. Instead of "Reference [1] shows" or "[1] shows", use "Smith [1] showed" (or "Smith and Jones [1] showed", or "Smith et al. [1] showed" for more than two authors; note that "et al." takes a period). Alternatively, name the system: "the FooBar protocol [1]". This keeps the reader from having to flip back to the references to recognize citations by author name or protocol name. There is no need to refer to RFC numbers in the text (except in RFCs and Internet Drafts), with an exception for very low-level presentations: "RFC822-style addresses".
16. Use normal capitalization in captions ("This is a caption", not "This is a Caption").
17. Headings must be capitalized consistently, either in sentence style or capitalizing words, across all levels of headings. General captions for figures and tables are best in sentence style.
18. Parentheses and braces are always surrounded by spaces: "the experiment(Sect. 7)shows" is wrong; "the experiment (Sect. 7) shows" is right.
19. Avoid excessive parenthesized remarks, as they make the text hard to read; fold them into the main sentence. Check whether the publication allows footnotes; some magazines frown upon them. More than one or two footnotes per page, or a handful per paper, is a bad sign: you probably should have applied to law school instead.
20. The material should make just as much sense without the footnotes; if the reader constantly has to look at footnotes, they are likely to lose their place in the text. URLs are better placed in the references rather than in footnotes, so that the reader knows the footnote is just a reference, not material important for understanding the text.
21. There is no space between the text and the superscript for a footnote; in LaTeX, it's "text\footnote{...}", not "text \footnote{...}".
22. Check that abbreviations are always explained before first use, except those known to the appropriate CS audience (CD, OS, CPU, CD-ROM, etc.).
23. Never start a sentence with "And" (there are exceptions to this rule, but these are best left to the experts).
24. Don't use colons (:) in mid-sentence; for example, "this is possible because: somebody said so" is wrong, as the part before the colon must be a complete sentence.
25. In formal writing, contractions like "don't", "doesn't", "won't" or "it's" are generally avoided.
26. Be careful not to confuse "its" with "it's" (= "it is").
27. Watch expressions of comparison: "singing is easier than driving" is much better than "singing has the advantage of being easier" or "the advantage of singing is that it is easier".
28. Don't use slash-constructs like "and/or" in formal prose; they are acceptable for slides, but in prose such expressions should be expanded into "X or Y", "X and Y", or "X or Y or both", depending on the meaning intended.
29. Avoid clichés like "recent advances in ...".
30. Don't use symbols like & (for "and"), % (for "fraction" or "percentage"), or arrows (for "follows" or "implies") in prose outside of equations; these are only acceptable in slides.
31. Avoid capitalization of terms: your paper is not the US Constitution or the Declaration of Independence. Technical terms are in lower case; some people use upper case when expanding an acronym, as in Asynchronous Transfer Mode (ATM).
32. Expand all acronyms on first use, except acronyms that every reader is expected to know. (In a research paper on a protocol, expanding the protocol's well-known acronym is probably not needed; somebody who does not know what it stands for is not likely to appreciate the rest of the paper either.)
33. Each paragraph should have a lead sentence summarizing its content. If this doesn't work naturally, the paragraph is probably too short. Try reading just the first sentence of each paragraph; the paper should still make sense. For example: "There are two service models: integrated and differentiated service. Integrated service follows the German approach that whatever isn't explicitly allowed is verboten: it strictly regulates traffic, but also makes the trains run on time. Differentiated service follows the Animal Farm approach, where some traffic is more equal than others: it seems simpler, until one has to worry about proletarian traffic dressing up as the aristocrats."
34. Units are always set in roman font, never italics. Units are set off by a (thin) space from the number; in LaTeX, use ~ to avoid splitting the number and the unit across two lines, as in 17~ms. (\, in LaTeX produces a thin space.)
35. For readability, the digits of numbers of 1,000 and above are grouped with commas.
36. Use Mb/s or Mbit/s, not Mbps or mbps; the latter are not scientific units. Be careful to distinguish Mb (megabit) and MB (megabyte), and in particular kb (1000 bits) and KB (1024 bytes).
37. It's always kb/s (lower-case k), not Kb/s or Kbps.
38. Use ms, not msec, for milliseconds.
39. Use 0.5 instead of .5, i.e., do not omit the zero in front of the decimal point. (Words into Type recommends that a zero be set before the decimal point for quantities less than one, except for quantities that never exceed one.)
40. Avoid "etc."; use "for example", "such as" or "among others", or, better, give the complete list (unless citing a list known to be incomplete). See also Strunk and White: "Etc.: not to be used of persons. Equivalent to 'and the rest', 'and so forth', and hence not to be used if one of these would be insufficient, that is, if the reader would be left in doubt as to any important particulars. Least open to objection when it represents the last terms of a list already given in full, or immaterial words at the end of a quotation. At the end of a list introduced by 'such as', 'for example', or any similar expression, etc. is incorrect."
41. If you say "for example" or "like", do not follow it with "etc.": in "fruit like apples, bananas and oranges", the "like" already indicates that there are more such items.
42. Avoid bulleted lists of one-sentence paragraphs; they make your paper look like a slide presentation and interfere with smooth reading.
43. Avoid excessive use of "while"; vary your expression, as "while" is not the universal conjunction.
44. Remember that "i.e." and "e.g." are always followed by a comma.
45. Do not use ampersands (&) or slash abbreviations (such as "s/w" or "h/w") in formal writing; they are acceptable for slides.
46. "respectively" is preceded by a comma, as in "the light bulbs lasted 10 and 100 days, respectively".
47. "Therefore", "however", "hence" and "thus" are usually followed by a comma, as in "Therefore, our idea should not be implemented."
48. Never use "related works"; unless you are talking about works of art, it's "related work".
49. Similarly, "codes" refers to encryption keys, not multiple programs: you modified multiple programs, not multiple codes.
50. Use "in Figure 1" instead of "in the following figure", since figures may be moved during the publication or typesetting process; don't assume that the LaTeX figure sits where you put it.
51. Text columns in tables are left-aligned; numeric columns are aligned on the decimal point or right-aligned.
52. "Section", "Figure" and "Table" are capitalized, as in "as discussed in Section 3". "Figure" can be abbreviated as "Fig.", but the others are not usually abbreviated; that is a matter of taste, so just be consistent.
53. Section titles are not followed by a period.
54. In LaTeX, tie the figure number to its reference so that it doesn't get broken across two lines: Fig.~\ref{fig:arch}.
55. Do not use GIF images for figures, as GIFs produce horrible print quality. Export into PostScript; both xfig and gnuplot generate PostScript that can be included without difficulties.
56. Only use line graphs when you are trying to show a functional or causal relationship between variables; when showing different experiments, for example, use bar graphs or scatter plots.
57. Figures show, depict, indicate, illustrate; avoid "refer to Fig. 17". Often it is enough to simply put the figure reference in parentheses.
58. If you quote something verbatim, enclose it in quotation marks or set it apart in an indented block quote, with a citation. Mere citation is not sufficient, as it does not tell the reader whether you derived your material from the cited source or copied it verbatim.
59. Technical report citations must name the organization, such as the university or company; conference citations must include the location.
60. Do not refer to colors in graphs; most people will print the paper on a monochrome (black and white) printer and will have no idea what you are talking about. Make sure that graph lines are easily distinguishable when printed in monochrome.
61. Do not forget to acknowledge your funding sources; if you do, don't count on having any to acknowledge in the future.
62. Check your references to make sure they are up to date.
63. Conference references should contain the location of the conference, the month, and some indication such as "Proc. of" or "Conference". Journal references always contain the volume, issue number and pages. It must be obvious from the citation whether an article appeared in a journal or in conference proceedings.
64. Avoid numbers with artificial precision. Unless you have done enough experiments to be sure that the fourth digit after the decimal point is indeed meaningful, five digits after the decimal point means you are overselling your results.

Bibliography
• Avoid use of "et al." in a bibliography unless the author list is very long (five or more authors); the author subsumed into "et al." may be your advisor or your reviewer. Check the punctuation of "et al."
• Internet Drafts must be marked "work in progress". Make sure that they have not been replaced by newer versions or RFCs; any Internet Draft reference older than six months should automatically be suspect, since Internet Drafts expire after that time period.
• Book citations include publication years, but not ISBN numbers.
• It is now acceptable to include URLs to material, but it is probably bad form to include a URL pointing to the author's web page for papers published in IEEE and ACM publications, given the copyright situation. Use URLs for software and other non-library material. Avoid long URLs; it may be sufficient to point to the general page and let the reader find the material. General URLs are also less likely to change.

Citing URLs
Follow these guidelines when using URLs in your paper, based on the paper "Persistence of Web References in Scientific Research" (Steve Lawrence, David M. Pennock, Gary William Flake, Robert Krovetz, Frans Coetzee, Eric Glover, Finn Arup Nielsen, Andries Kruger, and C. Lee Giles, in IEEE Computer, 2001); additional items have been added in brackets.
• Provide formal citations along with URL citations whenever possible, but include valuable URL citations even when formal citations are unavailable; the loss of some links over time is preferable to depriving readers of all of them. Even when formal citations are available, providing an accompanying URL can significantly improve the information's accessibility.
• Provide enough context information so that readers can pose adequate search-engine queries to track down invalid links; for example, when giving the URL of a preprint, provide the document's full title along with the full details of the authors.
• Check all URLs, particularly ones that have apparent embedded user information; it is best for somebody other than the author to check these URLs.
In the case of URLs for archives or repositories controlled by the author:
• Place material in a reliable central repository, such as a preprint or software archive. This is particularly important for links to complete versions of papers, omitted proofs and supporting data. Most academic institutions have a technical report series; if one of the co-authors is a member of such an institution, strongly consider placing the full version of the paper there. For open-source software, consider using the free services of SourceForge.net. Avoid using student accounts; use pointers to research-lab or group pages, as these are often maintained even after the actual owner of the research material leaves.
• Name the repository and include the name in citations if the name is useful for later searches. For software distributions, include a file with the name of the software package, as some search engines index file names. Avoid storing just tar files; at least store the manual page or README as a separate document, to increase the likelihood that it will be indexed. Provide a documented homepage for software and establish a domain name.
• When referencing software or software manuals, reference a URL for the entire project rather than URLs of specific software or manual versions. Version files frequently become unavailable after the software or manual is updated; also, you probably want readers to obtain the current release when they read the article, rather than the version available at the time the article was written.
• Avoid URLs that depend on personal directories, specific machines or subnet names.
• Leave a space between first names and the last name: "J. P. Doe", not "J.P.Doe".
• References lacking source, date and other identifying information are useless; always cite them.
• For conference papers, name the conference location, the month, and the full conference name, not just some abbreviation. Page numbers are nice but optional, as this information is readily available via the IEEE or ACM digital libraries.
• A citation consisting of little more than an author name and a year is useless; the paper has presumably been published somewhere findable via Google or the ACM or IEEE digital libraries, so help your reader find it.

Acknowledgements
• Acknowledge your funding sources. Some sources have specific wording requirements and may prefer that the grant number is listed; the NSF, for example, requires text like "This work was supported by the National Science Foundation under grant EIA NN-NNNNN."
• Generally, anonymous reviewers don't get acknowledged, unless they really provided an exceptional level of feedback or insight.

Reporting numerical results and simulations
In all but extended abstracts, numerical results and simulations should be reported in enough detail that the reader can duplicate the results. This should include all parameters used, indications of the number of samples that contributed to the analysis, and any initial conditions, if relevant.
When presenting simulation results, provide insight into their statistical confidence; if at all possible, provide confidence intervals. If there is a strange behavior in a graph (e.g., a dip, peak or change in slope), this behavior either needs to be explained or reasons must be given why it is simply due to statistical aberration; in the latter case, gathering more samples is probably advised.
Figures should be chosen wisely. You can never plot the whole parameter space, so provide insight into which parameters are significant over what range and which ones are less important. It is not very enlightening to present lots of flat or linear lines.
The description of a graph should not just repeat the graphically obvious, such as "the delay rises with the load", but explain, for example, how this increase relates to the load increase. Is it linear? Does it follow some well-known other system behavior, such as that of standard queueing systems?

LaTeX considerations
• There is no need to enclose plain numbers in $$ (math mode).
• Use \cite{a,b,c}, not \cite{a} \cite{b} \cite{c}.
• Use the \usepackage{times} option for LaTeX2e; it comes out much nicer on printers with different resolutions, and, compared to cmr, it probably squeezes an extra 10% of text out of your conference allotment.
• Multi-letter subscripts are set in roman, not italics: x_{\mathrm{max}}.
• For uniformity, use the LaTeX2e graphics setup, not the earlier psfigure style:

    \usepackage{graphics}
    \begin{figure}
    \resizebox{!}{0.5\textheight}{\includegraphics{foo.eps}}
    \caption{Some figure}
    \label{fig:figure}
    \end{figure}

Things to Avoid
1) Too much motivational material: three reasons are enough, and they should be described very briefly.
2) Describing the obvious parts of the result: "obvious" is defined as any result that a graduate of your program would suggest as a solution if you pose the problem that the result solves.
3) Describing unnecessary details: a detail is unnecessary if its omission will not harm the reader's ability to understand the important novel aspects of the result.
4) Spelling errors: given the availability of spell checkers, there is no reason to have spelling errors in a manuscript. If you, as the author, didn't take the time to spell-check your paper, why should the editor or reviewer take the time to read it, or trust that your diligence in technical matters is any better than your diligence in presentation? Note, however, that spell checkers don't catch all common errors, in particular word duplication ("the the"); if in doubt, consult a dictionary such as the (on-line) Merriam-Webster.
5) Text in Arial: Arial and other sans-serif fonts are fine for slides and posters, but are harder to read in continuous text; use Times Roman or similar serif fonts. Unusual fonts are less likely to be available at the recipient and can cause printing or display problems.

Guidelines for Experimental Papers
Guidelines set forth for researchers submitting articles to the journal Machine Learning:
1. Papers that introduce a new learning "setting" or type of application should justify the relevance and importance of this setting, for example based on its utility in applications, its appropriateness as a model of human or animal learning, or its importance in addressing fundamental questions in machine learning.
2. Papers describing a new algorithm should be clear, precise, and written in a way that allows the reader to compare the algorithm to other algorithms. For example, most learning algorithms can be viewed as optimizing (at least approximately) some measure of performance; a good way to describe a new algorithm is to make this performance measure explicit. Another useful way of describing an algorithm is to define the space of hypotheses that it searches when optimizing the performance measure.
3. Papers introducing a new algorithm should conduct experiments comparing it to state-of-the-art algorithms for the same or similar problems. Where possible, performance should also be compared against an absolute standard of ideal performance, as well as against a naive standard (e.g., random guessing, or guessing the most common class). Unusual performance criteria should be carefully defined and justified.
4. All experiments must include measures of uncertainty of the conclusions, typically confidence intervals, statistical tests, or estimates of standard error. Proper experimental methodology should be employed; for example, if test sets are used to measure generalization performance, no information from the test set should be available to the learning process.
5. Descriptions of the software and data sufficient to replicate the experiments must be included in the paper. Once the paper has appeared, authors are strongly urged to make the data used in the experiments available to other scientists wishing to replicate them. An excellent way to achieve this is to deposit the data sets at the UCI Repository of Machine Learning Databases; another good option is to add the data sets to the DELVE benchmark collection at the University of Toronto. For proprietary data sets, authors are encouraged to develop synthetic data sets having the same statistical properties, which can then be made freely available.
6. Conclusions drawn from a series of experimental runs should be clearly stated. Graphical display of experimental data can be very effective; supporting tables of exact numerical results from the experiments should be provided in an appendix.
7. Limitations of the algorithm should be described in detail. Interesting cases where an algorithm fails are important in clarifying the range of applicability of an algorithm.

Other advice
From Bill Stewart (Slashdot, 2006), edited:
• Write like a newspaper reporter, not a grad student.
• Your objective is clear communication to the reader, not beauty or erudition or narration of your discoveries and reasoning process. Don't make them reach for a dictionary, or give it up front.
• Put the important conclusions in the first few sentences so your reader will read them. If you'd like to wrap up with them at the end of your memo, that's fine too, in case anybody's still reading by then, but conclusions come first.
• If you're trying to express something complex, simplify your writing so it doesn't get in the way; for something simple, 10th-grade language structures will do, but if it's really hairy stuff, back down to 8th grade or so.
• Think about what your audience knows and doesn't know, and what they do and don't care about; express things in their terms, not yours.
From another Slashdot contributor (2006):
• Top-down design: starting with an outline and working out the details is the normal way to attack an engineering problem.
• Check your facts: engineers should be used to checking anything that is even remotely doubtful before committing to it. So should writers.
• Failure-mode analysis: for each sentence, ask yourself: could it be misread? How? What is the best way to fix it?
• Dependency analysis: are the ideas presented in an order that assures that each point can be understood on the basis of the reader's assumed knowledge and the information provided by preceding points?
• Optimization: are there any unnecessary parts? Does the structure require the reader to remember too many details at once before linking them?
• Final testing: if you read what you have written assuming only the knowledge that the reader can be expected to have, does each part say what you intended? If you read it aloud, does it sound the way you intended?
Finally, you should give the paper to somebody else to read. If you can find people, use one person familiar with the technical matter and another only generally familiar with the area.

The Conference Review Process
It is hard to generalize the review process for conferences, but most reputable conferences operate according to these basic rules:
1. The paper is submitted to the technical program chair(s). Most current conferences require electronic submission, in either PostScript or PDF formats, occasionally in Word.
2. The technical program chair assigns the paper to one or more technical program committee members, hopefully experts in their field. The identity of this TPC member is kept secret.
3. The TPC member usually provides a review, but may also be asked to find between one and three external reviewers who are not members of the TPC. They may be colleagues of the reviewer at the same institution, his or her graduate students, or somebody listed in the references. Graduate-student reviews can be quite helpful, since these reviewers often provide more detailed criticism rather than blanket dismissal. A good conference will strive to provide at least three reviews; however, since conferences operate under tight deadlines and not all reviewers deliver as promised, it is not uncommon that you receive only one or two reviews.
4. In some conferences, there is an on-line discussion of papers among the reviewers for a particular paper. Usually, the lead TPC member drives the discussion and then recommends the paper for acceptance, rejection or discussion at the TPC meeting.
5. The technical program chair then collects the reviews and sorts the papers according to their average review scores.
6. The TPC (or rather the subset that can make the meeting) then meets in person or by phone conference. Usually, the bottom third and the top third are rejected and accepted, respectively, without much further discussion. The papers discussed are those in the middle of the range, those where a TPC member feels strongly that the paper ended up in the wrong bin, or those where the review scores differ significantly. Papers that received only two reviews are also often discussed, maybe with a quick review by one of the TPC members as additional background. The rigor of the TPC meeting depends on the size and reputation of the conference; in some workshops and conferences, the TPC chairs may well make the final decision themselves, without involving the whole TPC.

Contributors: this guide contains material provided by Gail Kaiser, Craig Partridge, Eric Siegel, Sal Stolfo, Luca Trevisan and others.

Acknowledgements: this article has been adapted from Henning Schulzrinne's "Writing Technical Articles" notes on technical papers in systems and networking. We are thankful to the authors of, and contributors to, that document. Source: http://www.cs.columbia.edu/~hgs/etc/writing-style.html
Open Source Tools for AI Research

This is a compilation of useful tools covering areas of AI such as machine learning, data mining, natural language processing, computer vision, image processing, information retrieval, pattern recognition, speech recognition and multi-agent systems. The descriptions of the tools are drawn mostly from the respective websites. We have tried to include freely available open source tools with good documentation and community support, which should be of use in your work. To make the shortlist for this guide, we selected tools with which we have had some positive experience. We do not claim that this is a comprehensive collection of useful tools for AI research; if we have missed tools or sources that you have found useful, we request you to point them out to us. This compilation will also be available on the SIGAI website.
— SIGAI, ERTAI-2010

Name of the tool: Weka
Type: Free and open source software
Applicable in: Data mining, machine learning
Runs on: Java, on Windows, Linux or Mac OS X
Brief description: Weka is a collection of machine learning algorithms for data mining tasks. The algorithms can either be applied directly to a dataset or called from your own Java code, and an easy-to-use GUI is provided. It is developed by the University of Waikato. Weka is a good tool both for learning machine learning techniques and for experts who need results without re-implementing the techniques: with Weka you can focus on your core problem without worrying about implementations of the learning algorithms. To start applying these techniques to your own data, you just need to represent the data in the input format supported by Weka.
Documentation and support: apart from the usual documentation, there is a good companion book, "Data Mining: Practical Machine Learning Tools and Techniques" by Ian H. Witten and Eibe Frank.
License: GNU Public License
For more information: www.cs.waikato.ac.nz/ml/weka/

Name of the tool: NLTK (Natural Language Toolkit)
Type: Free and open source software
Applicable in: Natural language processing
Runs on: Python, on Windows, Linux or Mac OS X
Brief description: NLTK is a complete suite of open source Python modules, tutorials and data sets for research and development in Natural Language Processing (NLP). It provides modules for many NLP tasks, such as tagging, parsing and text classification. It is a useful tool for students, professionals and researchers interested in NLP and is used in NLP courses all over the world. Many of the basic, essential tasks in NLP can be performed with NLTK, leaving more time for your core problem.
Documentation and support: "Natural Language Processing with Python" by Steven Bird, Ewan Klein and Edward Loper is a good introduction to both NLP and the toolkit itself.
License: Apache License 2.0
For more information: www.nltk.org

Name of the tool: OpenCV (Open Source Computer Vision)
Type: Free and open source software
Applicable in: Computer vision
Runs on: C, C++ or Python, on Linux or Windows
Brief description: OpenCV is a library of programming functions for real-time computer vision, containing more than 500 optimized algorithms. OpenCV is free for both academic and commercial use; its uses range from interactive art, to mine inspection, to stitching maps on the web, and toward advanced robotics. Example areas covered by the library are human-computer interaction (HCI); object identification, segmentation and recognition; face recognition; gesture recognition; motion tracking, ego-motion and motion understanding; structure from motion (SFM); stereo and multi-camera calibration and depth computation; and mobile robotics.
Documentation and support: the OpenCV book, "Learning OpenCV: Computer Vision with the OpenCV Library" by Gary Bradski and Adrian Kaehler, is a good companion to the library and its wiki.
License: BSD license
For more information: the OpenCV project pages at opencvlibrary.sourceforge.net

Name of the tool: MALLET (MAchine Learning for LanguagE Toolkit)
Type: Free and open source software
Applicable in: Statistical natural language processing, machine learning on text
Runs on: Java, on Windows or Linux
Brief description: MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text. It includes sophisticated tools for document classification: efficient routines for converting text to "features", a wide variety of algorithms (including Naive Bayes, Maximum Entropy, and Decision Trees), and code for evaluating classifier performance using several commonly used metrics. In addition to classification, MALLET includes tools for sequence tagging for applications such as named-entity extraction from text; algorithms include Hidden Markov Models, Maximum Entropy Markov Models, and Conditional Random Fields, implemented in an extensible system for finite state transducers. The MALLET topic modeling toolkit contains efficient, sampling-based implementations of Latent Dirichlet Allocation, Pachinko Allocation, and Hierarchical LDA. Many of the algorithms in MALLET depend on numerical optimization; MALLET includes an efficient implementation of Limited-Memory BFGS, among other optimization methods. MALLET also includes routines for transforming text documents into numerical representations that can be processed efficiently, implemented through a flexible system of "pipes" which handle distinct tasks such as tokenizing strings, removing stopwords, and converting sequences into count vectors.
License: Released under the Common Public License
For more information: mallet.cs.umass.edu

Name of the tool: PyBrain (Python-Based Reinforcement Learning, Artificial Intelligence and Neural Network Library)
Type: Free and open source software
Applicable in: Machine learning (neural networks, reinforcement learning, supervised and unsupervised learning, evolution)
Runs on: Python, on Mac, Windows and Linux
Brief description: PyBrain contains algorithms for neural networks, for reinforcement learning (and the combination of the two), for unsupervised learning, and for evolution. Since most current problems deal with continuous state and action spaces, function approximators (like neural networks) must be used to cope with the large dimensionality; the library is built around neural networks in the kernel, and all of the training methods accept a neural network as the to-be-trained instance. This makes PyBrain a powerful tool for real-life tasks. PyBrain also comes with a variety of predefined environments that share a common interface, making it very easy to switch tasks. It is designed to be an easy-to-use modular library usable by entry-level students, while still offering the flexibility and algorithms needed for state-of-the-art research.
Documentation and support: documentation is at www.pybrain.org/docs
License: BSD license
For more information: www.pybrain.org

Name of the tool: CMU Sphinx
Type: Free and open source software
Applicable in: Speech recognition
Runs on: C and Java implementations, on Windows, Linux and Mac OS X
Brief description: Sphinx-4 is a state-of-the-art speech recognition system written entirely in the Java programming language. It is a very flexible system capable of performing many different types of recognition tasks, including recognizing discrete and continuous speech, and it has been used to build a range of speech recognizers. Among other things, Sphinx-4 provides:
• a generalized front end with pluggable implementations of end-pointing, FFT, Mel frequency filtering, spectral noise subtraction, discrete cosine transform, and feature extraction of cepstra, delta-cepstra and double-delta-cepstra features;
• a generalized, pluggable language model architecture supporting ASCII and binary versions of unigram, bigram and trigram models, the Java Speech API Grammar Format (JSGF), and ARPA-format FST grammars;
• a generalized acoustic model architecture with pluggable support for Sphinx-3 acoustic models;
• a generalized search manager with pluggable support for breadth-first and word-pruning searches;
• utilities for post-processing recognition results, including confidence scores, lattice generation and embeddable ECMAScript in JSGF tags.
Community support: CMU Sphinx has a strong development community and an active forum on its site.
License: BSD-style license
For more information: cmusphinx.sourceforge.net

Name of the tool: Nutch (release version 1.0, released on 23 March 2009)
Type: Free and open source software
Applicable in: Web search, crawling, indexing, querying, clustering
Runs on: any platform with Java (built on top of Lucene Java)
Brief description: Nutch is open source web-search software built on Lucene Java. It adds web-specifics, such as a crawler, a link-graph database, and parsers for HTML and other document formats. Nutch has a highly modular architecture, allowing developers to create plug-ins for media-type parsing, data retrieval, querying and clustering; existing plug-ins include PDFBox for extracting text and metadata from PDF files and Carrot2 for clustering. Nutch respects the robots exclusion standard (robots.txt) and politeness policies, to avoid overloading web sites. It meets multi-machine processing needs with an implementation of Google's MapReduce and a distributed file system (Apache Hadoop), and it has been used successfully at scales from small intranets to very large crawls, proving itself a scalable system.
Community support: Nutch is supported by an active Apache community; the FAQ, mailing lists, source version control and API documentation can be found on the site.
License: Apache License 2.0
For more information: lucene.apache.org/nutch

Name of the tool: LTI-Lib
Type: Free and open source software
Applicable in: Image processing, computer vision, pattern recognition
Runs on: Linux and Windows. For Windows you will need MS Visual C++ .NET 2003 or later; for Linux you will require at least GCC 2.95.3; if you want to use the viewers, you will also need the GTK libraries installed.
Brief description: the LTI-Lib is an object-oriented library with algorithms and data structures frequently used in image processing and computer vision. It has been developed at the Chair of Technical Computer Science (Lehrstuhl für Technische Informatik) of Aachen University of Technology, as part of many research projects in computer vision dealing with robotics, object recognition, and sign language and gesture recognition. Main features:
• object-oriented C++
• image processing algorithms
• image analysis algorithms
• a pattern recognition framework
• an evaluation framework for algorithms
• APIs for user interfaces, webcams, frame grabbers, etc.
Documentation and support: documentation is generated with Doxygen; there is also a mailing list for users and developers of the LTI-Lib.
License: LGPL
For more information: ltilib.sourceforge.net

Name of the tool: JADE (Java Agent DEvelopment Framework)
Type: Free and open source software
Applicable in: Multi-agent systems
Runs on: Java, on Windows or Linux
Brief description: JADE is a framework for the development of agents, deployable in both mobile and fixed environments, based on the peer-to-peer paradigm of intelligent autonomous agents. JADE enables developers to build and deploy multi-agent systems, including agents running on wireless networks and limited-resource devices, and the systems you develop can comply with the FIPA specifications. Its features include agent life-cycle management, agent communication, agent mobility and GUIs for remote monitoring, as well as more advanced features such as ontologies and content languages, complex behaviours and interaction protocols. A number of third-party add-ons extend JADE, for example JADE-LEAP, WSIG, the Semantics add-on and JADE-Android.
Documentation and support: a practical guide is the book "Developing Multi-Agent Systems with JADE" by Fabio Bellifemine, Giovanni Caire and Dominic Greenwood. The community is very helpful, and the authors themselves answer queries from users and developers.
License: LGPL
For more information: jade.tilab.com

Other useful sites
The AAAI site has an exhaustive collection of tools covering a very wide spectrum of AI: www.aaai.org/AITopics
MLOSS (Machine Learning Open Source Software): www.mloss.org/software. You will find a lot of other machine learning tools listed at MLOSS.