@article{belkin-niyogi,2002lea,
  title={{Laplacian Eigenmaps and Spectral Techniques for Embedding and Clustering}},
  author={Belkin, M. and Niyogi, P.},
  journal={ADVANCES IN NEURAL INFORMATION PROCESSING SYSTEMS},
  volume={1},
  pages={585--592},
  year={2002},
  publisher={MIT; 1998}
  annote={"This paper presents a technique for non-linear embedding high-dimensional 
           datasets  into low dimensional space. The method works by first contructing 
           adjacency graph by putting an edge between xi and xj if they are "close" (KNN
           or ||xi-xj||^2 < e).The weights may be assigned in 2 ways. If we want the 
           algorithm to be simple then we give weight 1 to edges between connected(by an 
           edge) vertices  else 0. If we want our model to be more informative then we use
           the gaussian heat kernel. Next we calculate the Laplacian matrix L= A-D, where 
           A is the adjacency matrix and D is the degree matrix. The solution of a 
           generalized eigenvector problem involvong L and D gives eigenmatrix from which 
           we pick first d vectors inincreasing order of their eigen values, starting from
           the second vector. - Kushlendra Mishra, Feb 09."}
}




@article{shlens2003tutorial,
  title={{A tutorial on principal component analysis}},
  author={Shlens, J.},
  journal={Copy retrieved [10-27-2004] from: http://www. snl. salk. edu/shlens/notes. html},
  year={2003},
  publisher={Citeseer}
  annote= { This article is tutorial of Principle Component analysis. Excerpts:
            
            The goal of PCA is to compute the most meaningful basis to re-express a noisy 
            data set. The hope is that this new basis will filter out the noise and reveal
            the hidden structure.

            PCA makes one stringent but powerful assumption:linearity. Lineariity vastly 
            simplifies the problem by(1) restricting the set of potential bases, and(2) 
            farmalizing teh implicit assumption of continuity in the data set. With this 
            assumption PCA is now limited to re-expressing the data as a linear 
            combination of its basis vectors.

            PX = Y 
            
            The above equation represents a change of basis and thus can have many 
            iterpretations.
             1. P is a matrix that transforms X into Y.
             2. Geometrically, P is a rotation and a stretch which again transforms X 
             into Y.
             3. The rows of P, {p1,...,pn}, are a set of new basis vectors for expressing 
             the columns of X.

            The covariance measures the degreee of linear relationship between two 
            variables. A large (small) value indicates high (low) redundancy.
         
            cova(A,B) >= 0
            cova(A,B) = va(A) iff A = B

            The covariance matrix captures the correlations between all possible pairs of 
            measurements (data points). The correlation values reflect the noise and 
            redundancy in our measurements.
             -> In the diagonal terms, by assumtion, large(small) values correspond to 
                interesting dynamics (or noise)
             -> In the off diagonal terms large (small) values correspond to high (low) 
                redundancy.

            Our goals are 
             (1) to minimize redundancy, measured by covariance, and
             (2) maximize the signal, measures by variance.
            Evidently, in an "optimized" matrix all off-diagonal terms in cova(Y) are 
            zero. Thus cova(Y) must be diagonal.

            In
        
            PX = Y
      
            P acts as a generalized rotaion to a align a basis with the maximally variant 
            axis. In multiple dimensions this could be performed by a simple algorithm:
             1. Select a normalized direcction in m-dimensinal space along ehich the 
                variance in X is maximized. Save this vector as p1
             2. Find another direction along which the variance is maximized, however 
                because of orthogonality condition, restrict the search to all directions 
                perpendicular to al previous selected directions. Save this vector as pi.
             3. Repeat this procedure until m vectors are selected.

            The resulting ordered set of p's are the princile components.

           Asumptions made for PCA:
             1. Linearity: Linearity frames the problem as a change of basis.
             2. Mean and variance are sufficiant staistics: It captures the notion that 
                mean and variance entirely describe a probability distribution. The only 
                class of distributions that are fully described by the first two moments 
                are exponential(e.g. Gaussian, Exponential, etc.).
             3. Large variances have important dynamics.
             4. Principle components are orthogonal.

           Performing PCA is quite simple in practice:
             1. Organize a data set as m*n matrix, where is the number of measurement 
                types(dimensions) and n is the number of trials (data points).
             2. Subtract of the mean for each measurement type. or row xi.
             3. Calculate the SVD or the eigenvectors of the covariance.

           Both the strength and weakness of PCA is that it is a non-parametric analysis. 
           One only needs to make the assumtions outlined above and tehn calculate the 
           answer. No parameters to tweak and adjust the answer.
                
}
}






@article{saxena-gupta-mukerjee:2004nld,
  title={{Non-linear Dimensionality Reduction by Locally Linear Isomaps}},
  author={Saxena, A. and Gupta, A. and Mukerjee, A.},
  journal={LECTURE NOTES IN COMPUTER SCIENCE},
  pages={1038--1043},
  year={2004},
  publisher={Springer}
  annote={"This paper presents improved variant of the Basic Isometric feature mapping 
           algorithm for manifold learning. Basic Isomap suffers from a serious 
           shortcoming of short-circuiting which means that when the distance between two 
           folds is very less or there is noise which causes two points in different folds
           to be classified as neighbors, then the distance computed in step 2 of the 
           algorithm are not the geodesic-distances, hence the algorithm fails. The 
           proposed algorithm first computes the candidate K nearest neighbors. Then these
           neighbors are usede to reconstruct the data  point. The weight for each 
           neighbor is estimated by reudcing the reconstruction error. The neighbors which
           are in locallly linear patch of the manifold get higher weights. Out of these K
           neighbors, Kll with higher weights are selected. Thus this algorithm uses two 
           parameters, K and Kll. The performance of this algorithm is compared with with 
           basic Isomap for three calsse of data. For all the test datasets, the new 
           algorithm didn't suffer from short-circuiting and had lesser residual variance.
            - Kushlendra Mishra, Feb 09"}  
}



@inproceedings{tenenbaum-silva-langford:2005,
  title={{A Global Geometric Framework for Nonlinear Dimensionality Reduction}},
  author={Tenenbaum, J.B. and Silva, V. and Langford, J.C.},
  journal={Science},
  volume={290},
  number={5500},
  pages={2319--2323},
  year={2000}
  annote={ "This aricle presents the basic Iosomettric Feature Mapping algorithm. The 
           algoritm works in three steps: calculating distance between pairs of pointsand 
           find out the neighbors of each point on the manifold, secondly, these 
           neighborhood relation is represented as a weighted graph, the third step 
           applies classical MDS to find a low dimensional isometric embedding of these 
           points. One advantage of Isomap is that it gives an estimate of the 
           dimensionality of the manifold. -Kushlendra Mishra, Feb 09"} 
}



@article{martin,backer:2005,
  title={{Estimating Manifold Dimension by Inversion Error}},
  author={Martin, S. and Backer, A.}
  booktitle = {SAC '05: Proceedings of the 2005 ACM symposium on Applied computing}, 
  annote= {
           The article is about accurately estimating the dimension of a manifold embedded 
           in a high dimension space. The method works by first calculating the inversion 
           error of the low dimensional (d) mapping. If d is the correct dimension of the
           manifold then this error should ideally be 0. Same is true for a mapping of 
           dimension greater than d.Practically however, it is neve zero so minimization 
           is attempted. Support vector regression technique is used to find out an inve-
           rse mapping. This is a general technique of estimating the dimensionality of a 
           manifold and works for any algorithm such as LLE and ISOMAP."}
}


@conference{carreiraperpinan2005pgc,
  title={{Proximity graphs for clustering and manifold learning}},
  author={Carreira-Perpinan, M.A. and Zemel, R.S.},
  booktitle={Advances in Neural Information Processing Systems 17: Proceedings Of The 2004 Conference},
  year={2005},
  organization={MIT Press}
  annote={ 
           In graph based algorithms for manifold learning, the graph constructed from the 
           nearest neighbor data doesn't reapresent the manifold well enough. This paper 
           presents two graph construction algorithms from nearest neighbor data. First is
           PMST ensemble which uses a perutrbed dataset to first find the edge weights and 
           then finds a MST. the graph is an ensemble of such spanning trees. The second 
           method DMST applies kruskals algorithm on the same dataset, but the edges are 
           not replaced, deriving an ensemble of MSTs. -Kushlendra Mishra
} 
}



@article{saul2003tgf,
  title={{Think globally, fit locally: unsupervised learning of low dimensional manifolds}},
  author={Saul, L.K. and Roweis, S.T.},
  journal={The Journal of Machine Learning Research},
  volume={4},
  pages={119--155},
  year={2003},
  publisher={MIT Press Cambridge, MA, USA}
  annote={"This paper gives a detailed description of the LLE algorithm. LLE tries to 
           reconstruct the manifold by itegrating locally linear patches. First nearest 
           neighbors of each sample point are found and the point is represented as a 
           convex sum of its neighbors. Weight matrox is found that minimizes  the 
           reconstruction errors. This weight matrix contains local geometric information 
           about each point, and is used to find a low dimensional embedding of the 
           manifold. The implementation and extension of LLE given in this paper is a bit 
           involved and difficult to grasp in the first reading. 
           -Kushlendra Mishra, Feb 2009"}
}







@article{srinivas1994mou,
  title={{Muiltiobjective Optimization Using Nondominated Sorting in Genetic Algorithms}},
  author={Srinivas, N. and Deb, K.},
  journal={Evolutionary Computation},
  volume={2},
  number={3},
  pages={221--248},
  year={1994},
  publisher={MIT Press}
  annote={ "This paper introduces the basic NSGA algoritm. The earlier methods for 
           multiobjective optimisation required the user to pass a parameter in the form 
           of a weight vector,   which was a difficult problem. The NSGA only required a 
           sharing parameter. The simulation tests compare the performance of VEGA and 
           NSGA, in VEGA the set of non-dominated solutions tend to cluster around some 
           regions, while NSGA always gives smoother pareto-front. Also keeping the 
           sharing parameter values to the extremes gives poor performance. 
           -Kushlendra Mishra, Feb 2009" } 
}




@article{deb2002fae,
  title={{A fast and elitist multiobjective genetic algorithm: NSGA-II}},
  author={Deb, K. and Pratap, A. and Agarwal, S. and Meyarivan, T.},
  journal={Evolutionary Computation, IEEE Transactions on},
  volume={6},
  number={2},
  pages={182--197},
  year={2002}
  annote={ "This article describes the improved version of the basic NSGA, the NSGA-II. 
           This algorithm has overall time complexity of O(MN^2) whereas basic NSGA had a 
           complexity of O(OMN^3). The paper introduces some innovations, such as Crowding
           Distance,Crowded-Comparison operator based on the former.The simulation results
           of this algorithm show a better performance incomparison to other contemporary 
           elitist algoritms PAES and SPEA.Two mertics have been defined for comparison, 
           the distance metric and the diversity metric. It also gives a way of 
           integrating constraint handling in NSGA-II."}
}


@article{zeleny1998multiple,
  title={{Multiple criteria decision making: eight concepts of optimality}},
  author={Zeleny, M.},
  journal={Human Systems Management},
  volume={17},
  pages={97--107},
  year={1998}
  annote={ excerpts:
           Because there are no tradeoffs along a single criterion, optimality is 
           essentially a multi-criteria concept.The real optimality, being distinct from 
           above simple computations, involves balancing multiple criteria and their 
           tradeoffs. This corresponds to the vector optimization of
           Max $f_1(x)$, Max $f_2(x)$, ... and Max $f_k(x)$, i.e., simultaneously and 
           subject to 
           x<~ X. 
            
           The concurrent maximization of individual objective functions should be 
           non-scalarized, separate and distinct, i.e., not subject to aggregation, like 
           forming a maximizing superfunction U{$f_1(x)$, ...}. Such an aggregation will 
           effectively reduce multi-objective optimization to a single 
           objective-optimization.

           Human decision making is obviously characterized by parallel search for both 
           the criteria and alternatives in order tiachieve their optimal interaction and 
           thus arrive at an optimal problem statement and it's optimal solution. Neither 
           the criteria (f or F), nor nor the alternatives are given in genuine 
           decision-making situations. There is a problem formulation representing an 
           "optimal pattern" of interaction between alternatives and criteria. It is this 
           optimal, ideal or cognitive equilibrium problem formulation or pattern that is 
           to be approximated or matched as closely as possible by decision makers. 
           Single-objective matching of the cognitive equilibrium is the simplest case.
           
           Pattern matching with multiple criteria is more involved and so far the most 
           complex optimality concept. In all "matching" concepts there is a need to 
           evaluate the closeness (resemblanceor match) of a proposed problem formulation 
           (or pattern).


@book{bransford2003people,
  title={{How people learn: Brain, mind, experience, and school}},
  author={Bransford, J.},
  year={2003},
  publisher={National Academy Press}
  annote={ Excerpts from second chapter:
           Research shows that it's not simply general abilities that differentiate 
           experts from novices. Instead experts have acquired extensive knowledge that 
           affects what they notice and how they organise, represent, and interpret 
           information in their envioronment. This in turn affects their abilities to 
           remember, readon and solve problems.

           Key principles of experts' knowledge:
           1. Experts notice features and meaningful patterns of information that ara not 
              noticed by novices.
           2. Experts have acquired a great deal of content knowledge that is organised in
              ways that reflect a deep understanding of their subject matter.
           3. Experts' knowledge cannot be reduced to a set of isolated facts or 
              propositions but, instead, reflects contexts of applicability: that is, the 
              knowledge is "conditionalized" on a set of circumstances.
           4. Experts are able to flexibly retrieve important aspects of their knowledge 
              with little attention effort.
           5. Though experts know their disciplines thoroughly, this does not guarantee 
              that they are able to teach it to others effectively.
           6. Experts have varying levels of flexibility in their approach to new 
              situations.

           DegRoot concluded that the knowledge acquired over tens of thousands of chess 
           playing hours of chess playing enabled chess masters to out-play their 
           opponents. Specifically, masters were more likely to recognize meaningful chess
           configurations and realize the strategic implications of these situations; this
           recognition allowed them to consider sets of possible moves that were superior 
           to others.

           The superior recall abilities of experts, illustrated in the example in the 
           box, has been explained in terms of how they "chunk" various elements of a 
           configuration that are related by an underlying funtion or strategy. Short term
           memory is enhanced when people are able to chunk information into familiar
           patterns.

           Chess masters percieve chunks of meaningful information, whichaffects their 
           memory for what they see. Chess masters are able to chunk together several 
           chess pieces in a configuration that is governed by some specific component of 
           the game.

           The expert circuit technicians chunked several individual circuit components
           (e.g. resistors and capacitors) that performes the function of a typical 
           amplifier. By remembering the structure and funcion of a typical amplifier, 
           experts were able to recall the arrangement of many of the individual circuit 
           elements comprising "amplifier chunks."

           Experts in physics cite laws and ideas involved in the solution of a particular
           problem, whereas competent beginners refer to equations that they will use for 
           a particular problem.
            
           Physics experts appear to evoke sets of related equations, with recall of one 
           equation activating related equations that are retrieved rapidly. Novices, in 
           contrast, retrieve equations more equally spaced in time, suggestiong a 
           sequential search in memory.Experts appear to possess an efficient organization
           of knowledge with meaningful relations among real elements clustered into 
           related units that are governed by underlying concepts and principles.
 
           Experts' knowledge is "conditionalized"--it includes specification of contexts
           in which it is applicabke. Knowledge that is not conditionalized is often 
           inert because it is not activated, even though it relevant.
	}

           
}



@inproceedings{mukerjee2009design-symbols,
  title={{THE BIRTH OF SYMBOLS IN DESIGN}},
  author={Amitabha Mukerjee and Madan Mohan Dabbeeru},
  institution={ASME 2009 International Design Engineering Technical Conferences and Computers and 
              Information Engineering Conference August-September 30-2, 2009}
  annote= {"This article proposes the groundwork by which CAD systems may acquire symbols
           by learnig usage patterns or image schemas grounded in experience.

           Excerpts:
           In human usage, each symbol has a vast range of associations , and any attempt 
           at definition will miss many of these, resulting in brittleness. Human 
           flexibility in symbol usage is possible because our symbols are learnt from a 
           vast experience of the world.    

           In many design tasks, the "good designs" lie along regions that can be mapped 
           to lower dimensional manifolds, owing to latent interdependencoes between the 
           variables. These low-dimensional structures (sometimes called chunks) may 
           constitute teh intermediate step between the raw experience and the eventual 
           symbol that arises after these pattern become stabilized through communication.

           Unfortunately the term "symbol", as it is used in the logic and computaional 
           community is considerably different from its usage in cognitive linguistics and
           in every-day life. In the latter usage symbols are imbued with meaning grounded
           in experience, whereas in the formal usage, it is merely a token constructed 
           from finite alphabet, and is related ony to other terms . If we may present an 
           analogy, the symbol "red" as it is used in a computer, is akin to the 
           understanding of the symbol by a blind man; he knows that it is a "color"
           (whatever that means) and that "green" and "blue" are other colors, and maybe 
           even that "crimson" is a shade of "red" but his understanding of red is 
           dramatically different from that of a sighted person, because the semantic pole
           is not connected to direct experience.
          
           On the other hand "symbol" has come to be understood in cog sci (and also in 
           traditional linguistics) as the tight binding of the psychological impression 
           of the sound( the phonological pole) with mental image of the meaning( the 
           semantic pole).  The human design process is a constant, motivated  exploration
           of the design space e.g. through sketching. All the while designer is focussing
           on the "good" design, and eventually some patterns emerge as teh common 
           characteristics of these designs. These patterns result in constraints whereby 
           many of the initial design variables can be combined, a process cognitively 
           known as chunking.

           It is well known that a designer who is an expert in a particular design domain
           is "confident of choosing a good design based on experience", and this is 
           partly because of these patterns she has internalized. Often, these relations 
           remain implicit, so that the human designer justifies the decision vaguely as 
           "looks right". However, if these patterns do cross the consciousness boundary 
           and get explicitly noticed, the designer may recognize it as an "aha!"  moment.

           By grounded, we refer to the progressive manner in which a human designer 
           learns her concepts - the more abstract ones are based on earlier, concrete 
           concepts, but are still presented thruogh instances.
         


@conference{deb2008deciphering,
  title={{Deciphering innovative principles for optimal electric brushless dc permanent magnet motor design}},
  author={Deb, K. and Sindhya, K.},
  booktitle={IEEE Congress on Evolutionary Computation, 2008. CEC 2008.(IEEE World Congress on Computational Intelligence)},
  pages={2283--2290},
  year={2008}
  annote= {
           A search and optimization procedure on natural evolutions and natural genetics 
           can be an effective mean to arrive at an innovative design for a single 
           objective scenario.


	}
  
}



@conference{hegde2007random,
  title={{Random projections for manifold learning}},
  author={Hegde, C. and Wakin, M.B. and Baraniuk, R.G.},
  booktitle={Neural Information Processing Systems (NIPS)},
  year={2007}
  annote = {
           This paper proposes that a small number N of random projections of sample 
           points in R^N belonging to an unknown K-dimensional Euclideann manifold, can 
           be used to estimate the intrinsic dimension of the sample set. It also proves 
           that using only this random projections, the structure of the underlying 
           manifold can be estimated. In both the cases the number of random projections 
           required is linear in K and logarithmic in N. A greedy algorithm is developed 
           to estimate the smallest size of smallest size of the projection space 
           required to perform manifold learning.


           Data that possesses merely K "intrinsic" degrees of freedom can be assumed to 
           lie on a K-imensional manifold in the high-dimensinal ambient space. Once the 
           manifold is identified, any point on it can be represented using essentially K
           pieces of information.
            
           Grassberger-Procaccia(GP) algorithm is used for estimating the intrinsic 
           dimensionsionality. It takes as input the set of pairwise distances between 
           sample points. It then computes the scale-dependent correlation dimension of 
           the data

}


@article{strube-cognitive,
  title={{Cognitive Science}},
  author={Strube, G.},
  journal={International Encyclopedia of the Social and Behavioral Sciences (IESBS). Oxford: Elsevier}
  annote={ This article is an introduction to the field of cognitive sciences. Excerpts
             
           -It was Leibniz who advocated the use of formal reasoning in the hope that 
           fruitless discussions of all kinds, and even wars, might be deemed unnecessary 
           by formalizing the arguments and computing the conclusions- hopelessly 
           optimistic from our point of view(afterall there would be endless debate on how
           to formalize the premises), but a         milestone, nevertheless, towards an 
           account of thinking as symbol processing. A physical symbol system is a type of
           formal system. Like all formal systems, it has an 'alphabet'(atleast of two) 
           arbitrarily defined symbols, as well as operators to create and transform 
           symbol structures(symbolic expressions and syntactic rules. This system is 
           physical in the sense that it has been implemented in a suitable way. Examples 
           of suitable encoding are levels of voltages or spikes(action  potentials) of 
           neurons. Other representations are also possible, but the aforementioned 
           already show how the theory can be applied to organisms as well as to technical
           systems. In sum, such a system needs physical implementation in order to 
           function in the real world and move from the idealized to the concrete. The 
           type of implementation is arbitrary, however, because the system functions 
           according to its symbolic expressions and syntactic rules, quite idependent of 
           its implementation.

           Symbols are arbitrary signs, but they designate objects or processes, including
           internalprocesses in the system itself. Their symantics is defined either by 
           reference to an object(depending on the respective symbolic expression, the 
           system exerts an influence on the object or is influenced by it), or by the 
           symbolic expression being executed as a program.


}


@article{cayton2005algorithms,
  title={{Algorithms for manifold learning}},
  author={Cayton, L.},
  journal={University of California, San Diego, Tech. Rep. CS2008-0923},
  year={2005}
  annote ={
           Manifold learning is an approach to nonlinear dimensionality reduction. 
           Algortims for this task are based on the idea that the dimensionalityof many 
           datasets is only artificially high; though each data point consists of perhaps 
           thousands of features, it may be described as a function of only a few 
           underlying parameters. That is, the data are actually samples from a 
           low-dimensional manifold that is embedded in a high-dimensional space. Manifold
           algorithms attempt to uncover these parameters in order to find a 
           low-dimensional representaion of the data. A dimensionality reduction procedure
           may help reveal what the underlying forces governing a data set are. For 
           example, suppose we are to classify an emailas spam or not spam. A typical 
           approach to this problem would be to represent an email as a vector of counts 
           of words appearing in the email. The dimensionality of this data can easily be 
           in the hundreds, yet an effective dimensionalityreduction technique may reveal 
           that there are only a few exceptionlly telling features, such as the word 
           "Viagra".

           Given a data set, PCA finds the directions(vectors) along which the data has 
           the maximum variance in addition to the relative importance of these directions.
           For example,suppose that we feed a set of three-dimensional points that all lie
           on a two-dim plane to PCA. PCA will return two vectors that span the plane 
           along with a third vector that is orthogonal to the plane(so that all of R^3 is
           spanned by these vectors). The two vectors that span the plane will be given 
           positive weights, but the third vector will have a weight of zero, since the 
           data does not varry along that direction. PCA is most useful when the data lies
           on or close to a linear subspace of the data set.
            
           A Homeomorphism is a continous function whose inverse is also a continuous 
           function.

           A d-dimensional manifold M is a set that is locally homeomorphic with R^d. That
           is for each x in M, there is an open neighborhood around x, N_x, and a 
           homeomorphism 
            f:N_x -> R^d. 
           these neighborhoods are referred to as coordinate patches. and the map is 
           referred to as coordinate chart. The image of coordinate charts is referred to 
           as the parameter space.

           Isomap. It may be viewed as an extension to Multidimensional Scaling (MDS), a 
           classical method for embedding dissimilarity information into Euclidean space. 
           Consists of two main steps:
             1. Estimate the geodesic distances (distances along a manifold) between the 
                points in the input using shortest path distances on the data set's k 
                nearest neighbor map.
             2. Use MDS to find points in the low-dimensional Euclidean space whose 
                interpoint distances match the distances found in step 1.
                
           Isometric chart; a chart that preserves the distances beetween points.
            
           Euclidean distance between  nearby points is assumed to be a good enough 
           approximation to the geodesic distance between them.

           To perform the estimation of geodesic distance between far away points, the 
           algorithm first constructs a k-nearest neighbor graph that is weightes by the 
           euclidean distances. Then the algorithm runs a shortest-path algorithm 
           (Dijkstra's or Floyd's) and uses it's output as the estimate of the remaining 
           geodesic distances.

           Once these geodesic distances are found Isomap finds points whose Euclideean 
           distances equal these geodesic distances. Since the manifold is isometrically 
           embedded such points exist, and in fact they are unique upto translation and 
           rotation. Multidimensional scaling is a classical technique that may be used to
           find such points.

           One particularly helpful feature of Isomap that is not found in other 
           algorithms is that it automatically provides an estimates of the dimensionality
           of the underlying manifold. The number of non-zero eigenvalues found in cMDS 
           gives the underlying dimensionality.

           Isomap also has optimality guarantee undder some assumptions. Isomap is 
           guaranteed to recover the parameterization of a manifold under the following 
           asssumptions:
             1. The manifold is isometrically embedded into R^D.
             2. The underlying parameter space is convex- i.e. the distance between any 
                two points in the parameter space is given by a geodesic that lies 
                entirely within the parameter space. 
             3. The manifold is well sampled everywhere.
             4. The manifold is compact.

           Many extensions to Isomap have been proposed. C-Isomap has been proposed to 
           handle conformal mappings(conformal mappings preserve angles). Landmark Isomap 
           has been proposed to handle very large data sets. Landmark points are used to 
           speed up the calculations of interpoint distances and cMDS. Both of these steps
           require O(n^3) steps. The idea is to select a small set of l landmark points 
           where l > d, but l << n. Next the approximate geodesic distance of each point 
           to these l landmark points is calculated. In place of nxn distances only nxl 
           are computed. Then cMDS is run on the lxl distance matrix of the landmark 
           points. Finally the other points are located using a simple formula based on 
           their distances to the landmarks. The complexity of L-Isomap is
           O(d ln log(n) + l^3).

           LLE is based on a substatntially different intuition. The idea comes from 
           visualizing the manifold as a collection of overlapping coordinate patches. If 
           the neighborhood sizes are small and the manifold is is sufficiently smooth, 
           then these patches will be approximately linear. Moreover, the chart from the 
           manifold to R^d will be roughly linear on these patches. The idea is to 
           identify these linear patches, characterize their geometry, and find a mapping 
           to R^d that preserves this local geometry and is nearly linear. It is assumed 
           that these local patches will overlap with one another so that local 
           reconstructions will combine ito a global one.

           The number of neighbors is a parameter of the algorithm.
            
           To characterize the geometry of the linear patches LLE attempts to represent 
           each x_i as a weighted, convex combination of its nearest  neighbors. The 
           weights are chosen to minimize the squared error for each i.

           The weight matrix W is used as a surrogate for the local geometry of the 
           patches. W_i reveals the layout of the points around x_i. Since it is invariant
           to rotations translations and scalings, W_i also characterises the local 
           geometry around x_i for any linear mapping of the neighborhood patch surrounded
           by x_i
           
           Second step of the algorithm finds a configuration in d-dimensions 
           (the dimensionality of the parameter space) whose local geometry is 
           characterized well by W.

           Unlike Isomap, d must be known apriory or estimated.
        }



@article{van2007dimensionality,
  title={{Dimensionality reduction: A comparative review}},
  author={Van Der Maaten, LJP and Postma, EO and Van Den Herik, HJ},
  journal={Preprint},
  year={2007},
  publisher={Citeseer}
}




@article{de2002locally,
  title={{Locally linear embedding for classification}},
  author={de Ridder, D. and Duin, R.P.W.},
  journal={Pattern Recognition Group, Dept. of Imaging Science \& Technology, Delft University of Technology, Delft, The Netherlands, Tech. Rep. PH-2002-01},
  year={2002}
  annote = { 
           Excerpts 
           To find a good LLE mapping, threee parameters have to be set: the 
           dimensionality m to map to, the number of neighbors k to take into account and 
           the regularisation parameter r.

           Mapping quality is quite sensitive to these parameters. If m is set too high, 
           the mapping will enhance noise; if it's too low points will be overlapped. If k
           is too small it the mapping will not reflect any global properties; if it's too
           high, the mapping will lose its nonlinear character and behave like traditional
           PCA, as the entire data set is seen as local neighborhood. Finally if r is set 
           incorrectly, the eigenvalues may not converge.

                      
           }



@article{de2003supervised,
  title={{Supervised locally linear embedding}},
  author={de Ridder, D. and Kouropteva, O. and Okun, O. and Pietikainen, M. and Duin, R.P.W.},
  journal={Lecture Notes in Computer Science},
  pages={333--341},
  year={2003},
  publisher={Springer}
  annote ={
           Excerpts
           Often the ideal decision boundary between different classes in such sets is 
           highly nonlinear. A classifier should therefore have many degrees of freedom, 
           and consequently a large number of parameters. As a result training a 
           classifier on such sets is quite complicated: a large number of parameters has 
           to be estimated using a limited number of samples. The curse of dimensionality.

           One can overcome this problem by first mapping the data to a high dimensional 
           space in which the classes become linearly separable. An alternative is to 
           lower the data dimensionality, rather than increase it; the reduction in 
           parameters one needs to estimate can result in better performance.

           Supervised LLE was introduced to deal with data sets containing multiple
           (often disjoint) manifolds. For fully disjoint manifolds, the local 
           neighborhood of a sample x_i from class c (1 <= c <= C) should be composed of 
           samples belonging to the same class only. This can be achieved by artificially 
           increasing the precalculated distance between samples belonging to different 
           classes, but leaving them unchanged if they belong to the same class.
           
           For 1-SLLE, distances between samples in different classes will be as large as 
           the maximum distance in the entire data set. this means neighburs of a sample 
           in class c will be always picked up from the same class. It is a non 
           parameterised supervised LLE. 

           In contrast alpha-SLLE introduces an aditional parameter alpha which controls 
           the amount of supervision. For 0 < alpha < 1, a mapping is found which 
           preserves some of the manifold structure but introduces separation between 
           classes. This allows supervised data analysis, but may also need to better 
           generalisations than 1-SLLE on previously unseen samples.
             
           }             
}

@article{deloache2004becoming,
  title={{Becoming symbol-minded}},
  author={DeLoache, J.S.},
  journal={Trends in cognitive sciences},
  volume={8},
  number={2},
  pages={66--70},
  year={2004},
  publisher={Elsevier}
  annote = {
           Infants initially accept a wide range of of entities as potential symbols and 
           young children are often confused about the nature of symbol-referent relations.

           A symbol is something that someone intends to represent something other than 
           itself. The component someone points to humans as the symbol species.

           How to define a symbol?
           Disagreements persist about symbol use, including what the symbol properly 
           refers to in the first place. The confusion stems in part from the fact that 
           different theorists have distinguished in very different and inconsistent ways 
           among the terms such as 'sign', 'signal', 'index', 'icon' , and so on.
           Philosopher Peirce and psychologists Bruner, Olver and Greenfield reserve the 
           term for entities that have purely arbitrary, formal and conventional relations
           to whatever they stand for.

           The emergence in evolution of the symbolic capacity irrevovcably transformed 
           our species, vastly expanding our intellectual horizons and making possible the
           cultural transmission of knowledge to succeding generations.

           Defining characteristics of symbols
             1. Symbols represent things. Symbols 'represent'; they refer to, they denote,
                they are about something. They are not merely associated with their 
                referents.
             2. Symbols are general. Virtually anything can be used to refer to anything.
                e.g. 'baby signs' - idiosyncratic gestures used ( and often invented) by 
                the child to refer to to communicate with others, like indicating a dog by
                sticking out the tongue and panting.
             3. Symbols are intentional. A person's intention that one entity represent 
                another is both necessary and sufficient to establish a symbolic relation.
                Nothing is inherently a symbol; only as a result of someone using it with 
                goal of denoting or referring does it take a symbolic role. Infants and 
                toddlers are sensitive to the intentions of other people in interpreting 
                their symbolic activity. When a present adult utters a novel word but 
                children see no novel object for it to refer to, they look around for one 
                or they look enquiringly at the speaker. Children are asked to draw two 
                persons. Their drawings are not very distinguishable. However when latter 
                asked to name the drawings they had produced the children insisted that a 
                given drawing was of whatever they had originally intended it to be.

                Learning symbols-referent relations
                Although infants can percieve a difference between real and depicted 
                objects, they do not understand the significance of that difference, so 
                they investigate.

                A vital function of symbols is to enable humans to acquire information 
                without direct experience. Our vast stores of cultural knowledge exist 
                only because we can learn indirectly through symbolic representations.

                Dual nature of symbols
                A unique aspect of symbolic objects is their inherently dual nature. A 
                symbolic artefact such as a picture or a model is both a concrete object 
                and a representation of something other than itself. To use such objects 
                effectively one must achieve dual representation, that is one mentally 
                represent the concrete object itself and its abstract relation to what it 
                stands for.
           }
}


@article{gobet2001chunking,
  title={{Chunking mechanisms in human learning}},
  author={Gobet, F. and Lane, P.C.R. and Croker, S. and Cheng, P.C.H. and Jones, G. and Oliver, I. and Pine, J.M.},
  journal={Trends in Cognitive Sciences},
  volume={5},
  number={6},
  pages={236--243},
  year={2001},
  publisher={Elsevier}
  annote =      {
                This article addresses the question of how chunk might be represented 
                within a cognitive system.
 
                A chunk is a collection of elements having strong associations with one 
                another, but weak associations with elements in other chunks.
           
                Some of the clearest evidence of perceptual chunking is found in how 
                primitive stimuli are grouped together into larger conceptual groupes, 
                such as the manner in which letters are grouped into words, sentnces or 
                even paragraphs. This grouping leads to memory and behavioural effects, in
                which the latencies for output of items comprising a chunk are shorter 
                than when those items comprise a number of smaller chunks. 
 
                The manner in which chunks are constructed affects the type of 
                generalizations and so predicts typical errors or successes.

                Elementary perciever and memorizer (EPAM)
                Learning is simulated by the growth of a discrimination network, where the
                internal nodes test for the presence of perceptual features, and the leaf 
                nodes store 'images', the internal representation of the external objects.
                EPAM has a limited short term memory.

                Iformation flows through EPAM as follows. First a stimulus is percieved 
                and converted into a set of features. Second, these features are sorted by
                the tests of the discrimination network, to retrrieve a pinter to a node 
                within LTM; this pointer is then stored within STM. Third depending on an 
                internal comparison process, learning may or may not occur within the 
                network. Finally, an action might be taken by the system or next stimulus 
                is retrieved from the environment. The extent of system's knowledge about 
                a given stimulus is indicated by the leaf nide reached after sorting that 
                stimulus through the discrimination network. Learning occurs by comparing 
                the information held in the leaf node with that in the stimulus. If there 
                is a perfect match, no learning occurs. If the image is the subset of the 
                stimulus additional information is added t to the leaf node. Finally if 
                there is a mismatch, the network is augmented: the leaf node becomes an 
                internal node with a test for the mismatch , and the stimmulus and the old
                image are used ad the basis of new leaf nodes.
          
                Chunk Heirarchy and Retrieval structures -- CHREST
                It features a number of additions to EPAM's basic learning mechanisms, 
                providing a greater degree of self-organization and adaptation to complex 
                data. One small addition is that every node within CHREST's discrimination
                network can contain an image.

                The major change is in the form of creating lateral links between nodes, 
                and for elaborating information within chunks to form more complex 
                schemata; these changes improve tehrichness of the semantic memory without
                affecting the important properties of the previous simulations.
          }
           
           
}


@article{chandrasekaran1999ontologies,
  title={{What are ontologies, and why do we need them?}},
  author={Chandrasekaran, B. and Josephson, J.R. and Benjamins, V.R.},
  journal={IEEE Intelligent systems},
  volume={14},
  number={1},
  pages={20--26},
  year={1999},
  publisher={Citeseer}
  annote =      {
                Excerpts:
                In ontology is the study of the kinds of things that exist. It is often 
                said that ontologies "carve the world at its joints." In AI the term 
                ontology has come to mean one of two related things. First of all 
                ontologyo is a representation voculabory, often specialized to some domain
                or subject matter. More precisely, it is not the vocabulary as such that 
                qualifies as ontology, but conceptualizations that the terms init the 
                vocabulary are intended to capture. Thus, translating the terms in an 
                otnology from one language to another doesn't change the ontology 
                conceptually.
                
                In its second sense the term ontology is sometimes used to refer to a body
                of knowledge describing some domain, typically a common sense knowledge 
                domain, using a representation vocabulry
}


